In [1]:
import transformers
import tokenizers
import gc
import torch
import os
import pandas as pd
import re
import math

from tqdm import tqdm

In [2]:
torch.cuda.is_available(), torch.cuda.is_initialized()

(False, False)

In [3]:
'''
sudo rmmod nvidia-uvm
sudo modprobe nvidia-uvm
'''

'\nsudo rmmod nvidia-uvm\nsudo modprobe nvidia-uvm\n'

In [4]:
TRAIN_EPOCHS = 5
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-3
GRAD_ACC_STEPS = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 4
MAX_LEN = 64
train_frac = 0.9

In [5]:
model_name = 'belabbert' # belabbert, medroberta, robbert, bertje
model_suffix = 'dapt_umcu_v1'
trainsets = ['HMC_decursus', 'HMC_radio', 'nvr', 'nvog', 'nvmdl', 'fms', 'CCN', 'journals', 'dhd', 'dictionaries', 'nvr', 'nvog', 'nvmdl', 'fms', 'nvvc', 'www', 'corpora', 'ARGUS', 'RA', 'IBD', 'books']
ignore_line = '---NEW DOCUMENT---'
output_dir = '/media/koekiemonster/DATA-FAST1/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/DAPT'
model_folder = os.path.join(output_dir, model_name+model_suffix)

In [6]:
os.chdir("/media/koekiemonster/DATA-FAST1/text_data/")

In [7]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaTokenizer, RobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("word_vectors_and_language_models/dutch/Medical/languagemodels/base/"+model_name)
model = AutoModelForMaskedLM.from_pretrained("word_vectors_and_language_models/dutch/Medical/languagemodels/base/"+model_name)

In [8]:
print('Num parameters: ',model.num_parameters())

Num parameters:  116803648


In [9]:
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer,TrainingArguments
from datasets import load_dataset, load_from_disk, load
from torch.utils.data.dataset import Dataset

In [10]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer):
        # or use the RobertaTokenizer from `transformers` directly.
        self.examples = []
        self.failed_docs = []
        # For every value in the dataframe 
        for example in tqdm(df.values):
            # 
            try:
                x=tokenizer.encode_plus(example, max_length = MAX_LEN, truncation=True, padding=True)
                self.examples += [x.input_ids]
            except:
                self.failed_docs.append(example)
                
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [11]:
# ADD simple strider that respects sentences
re_splitter = re.compile(r'\s')
re_sentencer = re.compile(r'[\r\n\.]+')
def get_strides(txt, max_len=496):
    tkns = re_splitter.split(txt.strip())

    docs = []
    _docs = []
    jdx = 0
    last_sent_idx = None
    for idx, tkn in enumerate(tkns, start=1):
        _docs.append(tkn)
        jdx = jdx+1
        if "." in  tkn:
            last_sent_idx = jdx            
        if (jdx%max_len==0):
            if last_sent_idx is None:
                last_sent_idx = jdx
            docs.append(_docs[:last_sent_idx+1])
            jdx=0
            _docs = _docs[last_sent_idx+1:]
    docs.append(_docs)
    return docs

In [12]:
# add last minute cleaners
re_mspace = re.compile(r'\s{2,}')
re_brackets = re.compile(r'[\[\]\<\>]')
re_integer = re.compile(r'\#+')
re_float = re.compile(r'\#+[\,\.]\#*')
re_vert =  re.compile(r'\|')

In [13]:
# UPDATE
docs = []
for s in trainsets:    
    with open('pubscience/COLLECTIONS/'+s+'.txt', 'r') as reader:
        _docs = reader.readlines()        
        _docs = [re_brackets.sub(' ', s) for s in _docs if  s.strip()!=ignore_line]
        _docs = [re_integer.sub('INT', s) for s in _docs]
        _docs = [re_float.sub('FLOAT', s) for s in _docs]
        _docs = [re_vert.sub(" ", s) for s in _docs]
        _docs = [re_mspace.sub(' ', s) for s in _docs]
        _docs = [_s for s in _docs for _s in get_strides(s, max_len=MAX_LEN)]
    docs.extend([t for t in _docs])

In [14]:
# do stuff with docs easy with list in list format
docs_joined = [" ".join(tlist) for tlist in docs]
del docs, _docs
gc.collect()

0

In [15]:
dataset = pd.DataFrame({"doc":pd.Series(docs_joined)})
dataset = dataset.sample(frac=1)

In [16]:
gc.collect()

0

In [17]:
dataset.reset_index(drop=True).to_csv("pubscience/COLLECTIONS/compressed/clinical_corpus_v1.csv.zip", 
                                      index=False, compression='zip')

In [18]:
train_dataset = CustomDataset(dataset.iloc[:int(dataset.shape[0]*train_frac)].doc, tokenizer)
test_dataset = CustomDataset(dataset.iloc[int(dataset.shape[0]*train_frac):].doc, tokenizer)

100%|██████████| 1051486/1051486 [41:57<00:00, 417.64it/s] 
100%|██████████| 116832/116832 [04:44<00:00, 411.32it/s]


In [19]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)

In [20]:
training_args = TrainingArguments(
        output_dir = model_folder,
        evaluation_strategy = 'epoch',
        gradient_accumulation_steps=GRAD_ACC_STEPS,
        num_train_epochs=TRAIN_EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=VALID_BATCH_SIZE,
        save_steps=8192,
        fp16=True,
        #eval_steps=4096,
        save_total_limit=1,
)

In [21]:
trainer = Trainer(model=model, 
                  args=training_args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=test_dataset)

Using cuda_amp half precision backend


In [38]:
trainer.train()
eval_results = trainer.evaluate()

***** Running training *****
  Num examples = 1051486
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 128
  Total optimization steps = 2053


Epoch,Training Loss,Validation Loss
0,1.1331,1.050987


***** Running Evaluation *****
  Num examples = 116832
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 116832
  Batch size = 4


In [39]:
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 2.86


In [40]:
eval_results

{'eval_loss': 1.0498334169387817,
 'eval_runtime': 1122.8279,
 'eval_samples_per_second': 104.052,
 'eval_steps_per_second': 26.013,
 'epoch': 1.0}

In [41]:
trainer.save_model(model_folder)
trainer.create_model_card()

Saving model checkpoint to /media/koekiemonster/DATA-FAST1/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/DAPT/robbertdapt_umcu_v1
Configuration saved in /media/koekiemonster/DATA-FAST1/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/DAPT/robbertdapt_umcu_v1/config.json
Model weights saved in /media/koekiemonster/DATA-FAST1/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/DAPT/robbertdapt_umcu_v1/pytorch_model.bin
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}


## Test masks

In [42]:
test_input = tokenizer("De patient heeft last van hartkloppingen en vage <mask>.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0][1:-1], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

- Ġman Ġheeft Ġlast Ġvan Ġhart st ig Ġen Ġvage Ġklachten .


In [43]:
test_input = tokenizer("De darmontstekingen zijn gelokaliseerd in de <mask>.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0][1:-1], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġde Ġmaag verd jes Ġis Ġben on eerd Ġin Ġde Ġbuik .


In [44]:
test_input = tokenizer("De <mask> zijn gelokaliseerd in de buik.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0][1:-1], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

ĠDe Ġnieren Ġis Ġben of eerd Ġin Ġde Ġbuik .


In [45]:
test_input = tokenizer("Er is gecalcificeerde<mask> in de aderen.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0][1:-1], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġer Ġis Ġc al c eerde Ġontsteking Ġin Ġde ĠA ders .


In [46]:
test_input = tokenizer("Geen atherosclerose, stenose of <mask>. Ventrikelseptum meet INT,INT mm,\
                       laterale wand INT,INTm, LV INTmm, RV INT mm op de axiale coupes in deze mid diastolische fase. Geen <MASK> zichtbaar.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0][1:-1], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

In Ġis ter Ġs or ose , Ġresten ologie Ġof Ġartrose . ĠOp r el ut on Ġmeet ĠIN t . IN m . ĠL aal , Ġlater se Ġwand Ġ( t . T m . ĠP V ĠIN T mm . ĠV ĠIN T Ġm Ġop Ġde ĠA x ale Ġcoup e Ġin Ġdeze Ġmid Ġdi st ole Ġperiode . ĠDe Ġ? m K k > Ġzichtbaar .


In [50]:
test_input = tokenizer("Thorax: Er is wel sprake van aortaklepstenose. Er is <mask> sprake van aortaklepstenose.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_hidden_states=True, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0][1:-1], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

t or ax . Ġer Ġis Ġwel Ġsprake Ġvan ĠA eerd a le st ose . Ġer Ġis Ġgeen Ġsprake Ġvan ĠA eert a le ste ie .


In [48]:
word_idcs = torch.argsort(proba[0][1:-1], axis=1)[:,-1]
print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Pan Hor ax Ġ: ĠHierbij Ġlijkt Ġsprake Ġvan ĠSten osie Ġin Ġde Ġan ort Ġak lep . ĠHiervan Ġlijkt Ġgeen Ġsprake Ġvan Ġa ort Ġak lep Ġsten rose .


In [36]:
if model_name=='bertje':
    last_hidden_state = output.hidden_states[-1]
    sentence_embedding = last_hidden_state[0][0]