In [1]:
import transformers
import tokenizers
import gc
import torch
import os
import pandas as pd
import re
import math

from tqdm import tqdm

In [2]:
torch.cuda.is_available(), torch.cuda.is_initialized()

(True, False)

In [3]:
'''
sudo rmmod nvidia-uvm
sudo modprobe nvidia-uvm
'''

'\nsudo rmmod nvidia-uvm\nsudo modprobe nvidia-uvm\n'

In [4]:
TRAIN_EPOCHS = 10
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-3
GRAD_ACC_STEPS = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
MAX_LEN = 512
train_frac = 0.9

In [5]:
model_name = 'medroberta_journals_ARGUS_nvvc_IBD_RA' # belabbert #medroberta
model_suffix = '_HMC'
trainsets = ['HMC'] #, 'journals', 'dhd', 'dictionaries', 'nvr', 'nvog', 'nvmdl', 'fms', 'nvvc', 'www', 'corpora', 'ARGUS', HMC', 'RA', 'IBD']
split_on = '---NEW DOCUMENT---'
output_dir = '/media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/'
model_folder = os.path.join(output_dir, model_name+model_suffix)

In [6]:
os.chdir("/media/koekiemonster/DATA-FAST/text_data/")

In [7]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaTokenizer, RobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("word_vectors_and_language_models/dutch/Medical/languagemodels/"+model_name)
model = AutoModelForMaskedLM.from_pretrained("word_vectors_and_language_models/dutch/Medical/languagemodels/"+model_name)
   

In [8]:
print('Num parameters: ',model.num_parameters())

Num parameters:  126031648


In [9]:
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer,TrainingArguments
from datasets import load_dataset, load_from_disk, load
from torch.utils.data.dataset import Dataset

In [10]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer):
        # or use the RobertaTokenizer from `transformers` directly.
        self.examples = []
        self.failed_docs = []
        # For every value in the dataframe 
        for example in tqdm(df.values):
            # 
            try:
                x=tokenizer.encode_plus(example, max_length = MAX_LEN, truncation=True, padding=True)
                self.examples += [x.input_ids]
            except:
                self.failed_docs.append(example)
                
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [11]:
# ADD simple strider that respects sentences
re_splitter = re.compile(r'\s')
re_sentencer = re.compile(r'[\r\n\.]+')
def get_strides(txt, max_len=496):
    tkns = re_splitter.split(txt.strip())

    docs = []
    _docs = []
    jdx = 0
    last_sent_idx = None
    for idx, tkn in enumerate(tkns, start=1):
        _docs.append(tkn)
        jdx = jdx+1
        if "." in  tkn:
            last_sent_idx = jdx            
        if (idx%max_len==0):
            if last_sent_idx is None:
                last_sent_idx = jdx
            docs.append(_docs[:last_sent_idx+1])
            jdx=0
            _docs = _docs[last_sent_idx+1:]
    docs.append(_docs)
    return docs

In [12]:
# add last minute cleaners
re_mspace = re.compile(r'\s{2,}')
re_brackets = re.compile(r'[\[\]\<\>]')
re_integer = re.compile(r'\#+')
re_float = re.compile(r'\#+\,\#*')
re_vert =  re.compile(r'\|')

In [13]:
# UPDATE
docs = []
for s in trainsets:    
    with open('pubscience/COLLECTIONS/'+s+'.txt', 'r') as reader:
        _docs = reader.readlines()
        _docs = [re_mspace.sub(' ', s) for s in _docs]
        _docs = [re_brackets.sub('', s) for s in _docs]
        _docs = [re_integer.sub('INT', s) for s in _docs]
        _docs = [re_float.sub('FLOAT', s) for s in _docs]
        _docs = [re_vert.sub(" ", s) for s in _docs]
        _docs = [_s for s in _docs for _s in get_strides(s, max_len=MAX_LEN)]
    docs.extend([t for t in _docs])

In [14]:
# do stuff with docs easy with list in list format
docs_joined = [" ".join(tlist) for tlist in docs]
del docs, _docs
gc.collect()

0

In [15]:
dataset = pd.DataFrame({"doc":pd.Series(docs_joined)})
dataset = dataset.sample(frac=1)

In [16]:
train_dataset = CustomDataset(dataset.iloc[:int(dataset.shape[0]*train_frac)].doc, tokenizer)
test_dataset = CustomDataset(dataset.iloc[int(dataset.shape[0]*train_frac):].doc, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 73908/73908 [02:45<00:00, 447.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8213/8213 [00:18<00:00, 450.86it/s]


In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)

In [20]:
training_args = TrainingArguments(
        output_dir = model_folder,
        evaluation_strategy = 'epoch',
        gradient_accumulation_steps=GRAD_ACC_STEPS,
        num_train_epochs=TRAIN_EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=VALID_BATCH_SIZE,
        save_steps=8192,
        fp16=True,
        #eval_steps=4096,
        save_total_limit=1,
)

In [21]:
trainer = Trainer(model=model, 
                  args=training_args, 
                  data_collator=data_collator, 
                  train_dataset=train_dataset, 
                  eval_dataset=test_dataset)

Using amp fp16 backend


In [22]:
trainer.train()
eval_results = trainer.evaluate()

***** Running training *****
  Num examples = 73908
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 128
  Total optimization steps = 1440


Epoch,Training Loss,Validation Loss
0,No log,1.47181
1,No log,1.378278
2,No log,1.323844
3,1.524000,1.299168
4,1.524000,1.274145
5,1.524000,1.25291
6,1.334300,1.242853
7,1.334300,1.235181
8,1.334300,1.228761
9,1.334300,1.227015


***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4
***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 8213
  Batch size = 4


In [23]:
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 3.42


In [24]:
trainer.save_model(model_folder)
trainer.create_model_card()

Saving model checkpoint to /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/medroberta_journals_ARGUS_nvvc_journals_ARGUS_nvvc_IBD_RA
Configuration saved in /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/medroberta_journals_ARGUS_nvvc_journals_ARGUS_nvvc_IBD_RA/config.json
Model weights saved in /media/koekiemonster/DATA-FAST/text_data/word_vectors_and_language_models/dutch/Medical/languagemodels/medroberta_journals_ARGUS_nvvc_journals_ARGUS_nvvc_IBD_RA/pytorch_model.bin
Dropping the following result as it does not have all the necessary field:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}


## Test masks

In [25]:
test_input = tokenizer("De patient heeft last van hartkloppingen en <mask>.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġnog De Ġpatient Ġheeft Ġlast Ġvan Ġhartkloppingen Ġen Ġduizeligheid . Ġ.


In [26]:
test_input = tokenizer("De darmontstekingen zijn gelokaliseerd in de <mask>.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġnog De Ġdarm ontstekingen Ġzijn Ġgelokaliseerd Ġin Ġde Ġbuik . ĠP


In [28]:
test_input = tokenizer("De <mask> zijn gelokaliseerd in de buik.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġnog De Ġpijnklachten Ġzijn Ġgelokaliseerd Ġin Ġde Ġbuik . ĠP


In [32]:
test_input = tokenizer("Er is gecalcificeerde <mask> in de aderen.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġnog ĠEr Ġis Ġgecalcificeerde Ġplaque Ġin Ġde Ġa deren . Ġ.


In [34]:
test_input = tokenizer("Geen atherosclerose of <mask>. Ventrikelseptum meet INT,INT mm, laterale wand INT,INTm, LV INTmm, RV INT mm op de axiale coupes in deze mid diastolische fase. Geen <mask> zichtbaar.", return_tensors='pt')
test_input.to(device='cuda:0')
output = trainer.model.forward(**test_input, output_attentions=True )

odds = torch.exp(output.logits)
proba = odds/(1+odds)
word_idcs = torch.argmax(proba[0], axis=1)

print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġnog Geen Ġatherosclerose Ġof Ġstenose . ĠV entrikel septum Ġmeet ĠINT , INT Ġmm , Ġlaterale Ġwand ĠINT , INT m , ĠLV ĠINT mm , ĠRV ĠINT Ġmm Ġop Ġde Ġaxiale Ġcoupes Ġin Ġdeze Ġmid Ġdiastolische Ġfase . ĠGeen ĠPFO Ġzichtbaar . ĠP


In [45]:
word_idcs = torch.argsort(proba[0], axis=1)[:,-2]
print(" ".join(tokenizer.convert_ids_to_tokens(word_idcs)))

Ġcoronair ĠGeen Ġcoronairsclerose Ġen Ġverkalkingen ? V entrik septumdefect Ġdiameter INT . ĠINT mm Ġen ĠLaterale Ġdoorsnede INT . ĠINT Ġm Ġen ĠRV INT cm . ĠLV INT mm op ĠDe Ġcoronale Ġcoupe in Ġdezelfde ĠMid Ġsystolische fase Ġ: Geen Ġstenose Ġaanwezig Ġ. ĠINT
