In [3]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, \
    TrainingArguments
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
from pathlib import Path
from pynvml import *
import torch
import argparse
import sys
import random
import pandas as pd
import numpy as np
import itertools
import evaluate
import math

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()


def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128)


# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     scmetrics.add_batch(predictions=predictions, references=labels)
#     return scmetrics.compute()

In [4]:
# parser = argparse.ArgumentParser(description='Task-adaptive model')
# parser.add_argument('--model', help='Path to BERT-like model')
# parser.add_argument('--model_name',
#                     help='Name of the BERT-like model. Default = "" which corresponds to ClinicalBERT',
#                     default='')
# config = parser.parse_args(sys.argv[1:])
task = 'sentiment' 
MODEL = 'UFNLP/gatortron-base'

#MODEL = f"cardiffnlp/twitter-roberta-base-{task}" #minerva: download model from hugging face and put in folder, update to path

# set seed
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


<torch._C.Generator at 0x7fa20066e3f0>

In [5]:
# Create task Dataset
task_dt_path = 'task_adapt_sentences.csv'
sentences = pd.read_csv(task_dt_path, header=None)
sentences = sentences[0].str.lower().str.split('.').values.tolist()
sentences = list(itertools.chain.from_iterable(sentences))

train, dev = train_test_split(sentences, test_size=0.2, random_state=42)
# dev, test = train_test_split(test, test_size=0.5, random_state=42)

task_dt = {'train': {},
           'dev': {}}
           #'test': {}}

for s in train:
    task_dt['train'].setdefault('text', list()).append(s)

for s in dev:
    task_dt['dev'].setdefault('text', list()).append(s)

task_dt = DatasetDict({k: Dataset.from_dict(task_dt[k]) for k in task_dt.keys() if k != 'test'})
task_dt.flatten()

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.add_tokens(['[DATE]', '[TIME]'], special_tokens=True)
tkn_dt = task_dt.map(tokenize_function, batched=True, num_proc=4)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Downloading (…)lve/main/config.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/379k [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/346 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/87 [00:00<?, ? examples/s]

In [6]:
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    model = AutoModelForMaskedLM.from_pretrained(
        MODEL,
        from_tf=False).to('cuda')
    print_gpu_utilization()
else:
    model = AutoModelForMaskedLM.from_pretrained(
        MODEL,
        from_tf=False)
model.resize_token_embeddings(len(tokenizer))

batch_size = 4 # [4, 8, 16]
logging_steps = len(task_dt["train"]) // batch_size

training_args = TrainingArguments(
    output_dir="runs/ta_pretraining",
    evaluation_strategy="epoch",
    learning_rate=5e-6, # [5e-6, 1e-5]
    num_train_epochs=5, 
    weight_decay=0.01, # [1e-6, 1e-4, 1e-8]
    warmup_ratio=0.01, #[1e-6, 1e-4, 1e-8]
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='epoch',
    save_total_limit=2,
    logging_steps=logging_steps,
    adam_epsilon=1e-6, # try to add AdamW optimizer post changing the batch size
    seed=42
) # seed = 42, data_seed = 42

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tkn_dt['train'],
    eval_dataset=tkn_dt['dev'],
    data_collator=data_collator   
)

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Downloading pytorch_model.bin:   0%|          | 0.00/713M [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Perplexity: 8.53


In [7]:
result = trainer.train() #resume_from_checkpoint=True)
print(result)
#print_summary(result)



Epoch,Training Loss,Validation Loss
1,2.2196,1.611951
2,1.9438,1.812136
3,1.9192,1.327535
4,1.9383,1.818197
5,1.6521,1.596408


TrainOutput(global_step=435, training_loss=1.9247360437765888, metrics={'train_runtime': 3663.7745, 'train_samples_per_second': 0.472, 'train_steps_per_second': 0.119, 'total_flos': 186999608016000.0, 'train_loss': 1.9247360437765888, 'epoch': 5.0})


In [8]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 4.89


In [74]:
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    model = AutoModelForMaskedLM.from_pretrained(
        MODEL,
        from_tf=False).to('cuda')
    print_gpu_utilization()
else:
    model = AutoModelForMaskedLM.from_pretrained(
        MODEL,
        from_tf=False)
model.resize_token_embeddings(len(tokenizer))

params = {
    'batch_size': [4, 8, 16], #[2, 4, 8],
    'learning_rate': [5e-6, 1e-5], #[5e-6, 1e-5, 2e-5, 5e-5, 1e-4],
} 

metrics_file = f'task_adaptation_metrics_GTron.csv'
if os.path.isfile(metrics_file):
    f = open(metrics_file, 'a')
else:
    f = open(metrics_file, 'w')
    f.write('batch_size,learning_rate,train_loss,eval_loss,ppl\n')

best_model = []
best_loss = 10
tmp_trainer, tmp_comb = None, None
for comb in list(ParameterGrid(params)):
    print(f"Parameters: {comb}")
    
    training_args = TrainingArguments(
        output_dir="runs/ta_pretraining/GTron",
        evaluation_strategy="epoch",
        learning_rate=comb['learning_rate'],
        num_train_epochs=5, 
        weight_decay=0.01,
        warmup_ratio=0.01,
        per_device_train_batch_size=comb['batch_size'],
        per_device_eval_batch_size=comb['batch_size'],
        save_strategy='epoch',
        save_total_limit=2,
        logging_steps= len(task_dt["train"]) // comb['batch_size'],
        adam_epsilon=1e-6, # try to add AdamW optimizer post changing the batch size
        seed=42
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tkn_dt['train'],
        eval_dataset=tkn_dt['dev'],
        data_collator=data_collator   
    )

    result = trainer.train()
    print(result)
    eval_results = trainer.evaluate()

    v = [comb['batch_size'], comb['learning_rate'], result.metrics['train_loss'], eval_results['eval_loss'], math.exp(eval_results['eval_loss'])]
    f.write(','.join([str(el) for el in v]) + '\n')

    if eval_results['eval_loss'] < best_loss:
        best_loss = eval_results['eval_loss']
        tmp_trainer = trainer
        tmp_comb = comb
    print('-' * 100)
    print('\n\n')

f.close()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaForMaskedLM: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
You should 

Parameters: {'batch_size': 4, 'learning_rate': 5e-06}




Epoch,Training Loss,Validation Loss
1,12.0849,10.296826
2,9.8405,8.993443
3,8.6037,8.292106
4,8.2559,8.105609
5,7.8843,8.26391


TrainOutput(global_step=435, training_loss=9.319472126577093, metrics={'train_runtime': 731.233, 'train_samples_per_second': 2.366, 'train_steps_per_second': 0.595, 'total_flos': 59358136312800.0, 'train_loss': 9.319472126577093, 'epoch': 5.0})


----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 4, 'learning_rate': 1e-05}




Epoch,Training Loss,Validation Loss
1,7.657,7.692203
2,7.3477,7.209986
3,6.8387,6.910732
4,6.7643,6.84383
5,6.5113,6.80808


TrainOutput(global_step=435, training_loss=7.02421602271069, metrics={'train_runtime': 778.6368, 'train_samples_per_second': 2.222, 'train_steps_per_second': 0.559, 'total_flos': 59358136312800.0, 'train_loss': 7.02421602271069, 'epoch': 5.0})


----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 8, 'learning_rate': 5e-06}




Epoch,Training Loss,Validation Loss
1,6.7995,6.882984
2,6.5928,6.570981
3,6.7172,6.812841
4,6.6363,6.440327
5,6.6132,6.222712


TrainOutput(global_step=220, training_loss=6.666054708307439, metrics={'train_runtime': 699.7053, 'train_samples_per_second': 2.472, 'train_steps_per_second': 0.314, 'total_flos': 76571173138920.0, 'train_loss': 6.666054708307439, 'epoch': 5.0})


----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 8, 'learning_rate': 1e-05}




Epoch,Training Loss,Validation Loss
1,6.1193,6.573441
2,5.9304,6.179474
3,6.1011,6.44016
4,6.0426,6.005545
5,6.0705,5.720152


TrainOutput(global_step=220, training_loss=6.046336962959983, metrics={'train_runtime': 723.4199, 'train_samples_per_second': 2.391, 'train_steps_per_second': 0.304, 'total_flos': 76571173138920.0, 'train_loss': 6.046336962959983, 'epoch': 5.0})


----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 16, 'learning_rate': 5e-06}




Epoch,Training Loss,Validation Loss
1,6.1184,6.219065
2,6.0872,6.319373
3,5.9236,5.75452
4,5.8149,6.183167
5,5.8979,5.697608


TrainOutput(global_step=110, training_loss=5.970101131092418, metrics={'train_runtime': 775.5768, 'train_samples_per_second': 2.231, 'train_steps_per_second': 0.142, 'total_flos': 94336450422420.0, 'train_loss': 5.970101131092418, 'epoch': 5.0})


----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 16, 'learning_rate': 1e-05}




Epoch,Training Loss,Validation Loss
1,5.624,6.124918
2,5.6944,6.129557
3,5.5696,5.568994
4,5.4969,6.021206
5,5.6059,5.480842


TrainOutput(global_step=110, training_loss=5.605954707752574, metrics={'train_runtime': 753.9493, 'train_samples_per_second': 2.295, 'train_steps_per_second': 0.146, 'total_flos': 94336450422420.0, 'train_loss': 5.605954707752574, 'epoch': 5.0})


----------------------------------------------------------------------------------------------------



