In [1]:
import transformers
import tokenizers
import gc
import torch
import os
import pandas as pd
import re

In [2]:
TRAIN_EPOCHS = 10
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-3
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
MAX_LEN = 128

train_frac = 0.9

In [3]:
model_name = 'robbert' # belabbert
model_suffix = '_books'
trainsets = ['nvr', 'nvog'] #, 'journals', 'dhd', 'dictionaries', 'nvr', 'nvog', 'nvmdl', 'fms', 'nvvc', 'www', 'corpora', 'ARGUS']
split_on = '---NEW DOCUMENT---'
output_dir = '/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/'
model_folder = os.path.join(output_dir, model_name+model_suffix)

In [4]:
os.chdir("/media/koekiemonster/DATA-FAST/text_data/")

In [8]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaTokenizer, RobertaForMaskedLM
if model_name == 'robbert':
    tokenizer = AutoTokenizer.from_pretrained("word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base")
    model = AutoModelForMaskedLM.from_pretrained("word_vectors_and_language_models/dutch/Medical/languagemodels/robbert-v2-dutch-base")
elif model_name == 'belabbert':
    tokenizer = AutoTokenizer.from_pretrained("pdelobelle/jwouts/belabBERT_115k")
    model = AutoModelForMaskedLM.from_pretrained("pdelobelle/jwouts/belabBERT_115k")    

In [9]:
print('Num parameters: ',model.num_parameters())

Num parameters:  116803648


In [10]:
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer,TrainingArguments
from datasets import load_dataset, load_from_disk, load
from torch.utils.data.dataset import Dataset

In [11]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer):
        # or use the RobertaTokenizer from `transformers` directly.
        self.examples = []
        self.failed_docs = []
        # For every value in the dataframe 
        for example in df.values:
            # 
            try:
                x=tokenizer.encode_plus(example, max_length = MAX_LEN, truncation=True, padding=True)
                self.examples += [x.input_ids]
            except:
                self.failed_docs.append(example)
                
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [None]:
# ADD simple strider that respects sentences
re_splitter = re.compile(r'\s')
def get_strides(txt, stride=496, max_len=512):
    tkns = re_splitter.split(txt)
    docs = []
    for idx in range(len(tkns)):
        if (idx%stride==0) & (idx>0):
            docs.append(tkns[])

In [16]:
# add last minute cleaners
re_mspace = re.compile(r'\s{2,}')

In [18]:
# UPDATE
docs = []
for s in trainsets:    
    with open('pubscience/COLLECTIONS/'+s+'.txt', 'r') as reader:
        text = reader.read()
        _docs = text.split(split_on)
        _docs = [re_mspace.sub(' ', s) for s in _docs]
        _docs = [get_strides(s, stride=MAX_LEN-6, max_len=MAX_LEN) for s in _docs]
    docs.extend([s for s in t for t in _docs])
    

NameError: name 'get_strides' is not defined

In [11]:
dataset = pd.DataFrame({"doc":pd.Series(docs)})
dataset = dataset.sample(frac=1)

In [12]:
train_dataset = CustomDataset(dataset.iloc[:int(dataset.shape[0]*train_frac)].doc, tokenizer)
test_dataset = CustomDataset(dataset.iloc[int(dataset.shape[0]*train_frac):].doc, tokenizer)

In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)

In [14]:
training_args = TrainingArguments(
        output_dir = model_folder,
        evaluation_strategy = 'epoch',
        num_train_epochs=TRAIN_EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=VALID_BATCH_SIZE,
        save_steps=8192,
        #eval_steps=4096,
        save_total_limit=1,
)

In [15]:
trainer = Trainer(model=model, 
                  args=training_args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=test_dataset)

In [18]:
torch.cuda.is_available(), torch.cuda.is_initialized()

(True, True)

In [16]:
trainer.train()
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running training *****
  Num examples = 7465
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2340
  0%|          | 0/2340 [00:00<?, ?it/s]

RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable

In [None]:
trainer.save_model(model_folder)