In [1]:
from transformers import BertModel, BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

import torch
import numpy as np
import spacy
import re

from datetime import datetime

In [2]:
# pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
# ner_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

In [3]:
sentences = ["When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.",
 'These facts seemed to me to throw some light on the origin of species--that mystery of mysteries, as it has been called by one of our greatest philosophers.',
 'On my return home, it occurred to me, in 1837, that something might perhaps be made out on this question by patiently accumulating and reflecting on all sorts of facts which could possibly have any bearing on it.',
 "After five years' work I allowed myself to speculate on the subject, and drew up some short notes; these I enlarged in 1844 into a sketch of the conclusions, which then seemed to me probable: from that period to the present day I have steadily pursued the same object.",
 'I hope that I may be excused for entering on these personal details, as I give them to show that I have not been hasty in coming to a decision.']

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load("en_core_web_sm")

In [5]:
tokenizer.model_max_length

512

In [6]:
def whole_word_MO_tokenization_and_masking(tokenizer, nlp_model, sequence: str):
        """
        posoi: Part-Of-Speech of interest
        
        Performs whole-word-masking based on selected posoi.
        
        POS possibilities:['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
                             
        TODO: What if no tokens are masked?
        
        """
        print('loading:', datetime.now().time())
        spacy_sentence = nlp_model(sequence, disable=["parser"])
        
        POS_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
        NER_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 
                    'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
        NER_pairs = ['']
        
        input_ids = tokenizer.encode(sequence, add_special_tokens=False)
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(sequence)
        print(input_tokens)
        sequence_pos_list = [token.pos_ for token in spacy_sentence]
        sequence_pos_frequency = {pos: sequence_pos_list.count(pos) for pos in sequence_pos_list}
        
        modified_input_list = []
        
        #POS-masking
        print('pos-start:', datetime.now().time())
        for posoi in sequence_pos_frequency.keys():
            posoi_vocab = [token.text for token in spacy_sentence if token.pos_ == posoi]
            
            mask_indices = []
            composite_word_indices = []
            composite_word_tokens = []
            for (i, token) in enumerate(input_tokens):
                if token == "[CLS]" or token == "[SEP]":
                    continue
                elif token.startswith("##"):
                    composite_word_indices.append(i)
                    composite_word_tokens.append(token)
                    if "".join([x.strip("##") for x in composite_word_tokens]) in posoi_vocab:
                        mask_indices = mask_indices + composite_word_indices

                elif token in posoi_vocab:
                    mask_indices.append(i)
                else:
                    composite_word_indices = [i]
                    composite_word_tokens = [token]

            mask_labels = [1 if i in mask_indices else 0 for i in range(len(input_tokens))]
            masked_tokens = [x if mask_labels[i] == 0 else 103 for i,x in enumerate(input_ids)]
            masked_input = tokenizer.decode(masked_tokens)         
            modified_input_list.append(masked_input)
            
            print(posoi, masked_input)
        
        print('lemma-start:', datetime.now().time())
        #POS-based lemmatization
        replacement_tuples = [(token.text, token.lemma_) for token in spacy_sentence if token.text.lower() != token.lemma_]
        #print(replacement_tuples)
        pos_replaced_sentence = sequence
        for replacement in replacement_tuples:
            pos_replaced_sentence = re.sub(r'\b' + replacement[0] + r'\b', replacement[1], pos_replaced_sentence)

        pos_replaced_sentence = pos_replaced_sentence.replace("  ", " ")
        print('Lemma', pos_replaced_sentence)
        modified_input_list.append(pos_replaced_sentence)
        
        #NER-based swapping of time-place (if present)
        print('ner-start:', datetime.now().time())
        ner_swapped_sentence = spacy_sentence.text
        for ent in spacy_sentence.ents:
            if ent.label_ == 'TIME':
                time_substring = ner_swapped_sentence[ent.start_char:ent.end_char].split(" ")
                time_substring.reverse()
                ner_swapped_sentence = ner_swapped_sentence.replace(ner_swapped_sentence[ent.start_char:ent.end_char], " ".join(time_substring))
        print('NER', ner_swapped_sentence)
        modified_input_list.append(ner_swapped_sentence)
        
        
        #TODO future ideas
        #
        #
        
    
        #actually tokenize input
        inputs = tokenizer(modified_input_list, return_tensors="pt", padding=True)

        inputs['labels'] = tokenizer([sequence for i in range(0,inputs['input_ids'].shape[0])], 
                                     return_attention_mask=False, 
                                     return_token_type_ids=False,
                                     return_tensors='pt', padding=True)['input_ids']
        
        return inputs

In [7]:
print(datetime.now().time())
test_sentence = "Anne went to the Albert Heijn at 5 o'clock to buy some milk for me."
example_sentence_inputs = whole_word_MO_tokenization_and_masking(tokenizer=tokenizer, nlp_model=nlp, sequence=test_sentence)
print(datetime.now().time())

12:17:49.368637
loading: 12:17:49.369633
Anne went to the Albert Heijn at 5 o'clock to buy some milk for me.
['anne', 'went', 'to', 'the', 'albert', 'he', '##ij', '##n', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.']
pos-start: 12:17:49.382632
PROPN anne went to the albert heijn at 5 o'clock to buy some milk for me.
VERB anne [MASK] to the albert heijn at 5 o'clock to [MASK] some milk for me.
ADP anne went [MASK] the albert heijn [MASK] 5 o'clock [MASK] buy some milk [MASK] me.
DET anne went to [MASK] albert heijn at 5 o'clock to buy [MASK] milk for me.
NUM anne went to the albert heijn at [MASK] o'clock to buy some milk for me.
NOUN anne went to the albert heijn at 5 o'clock to buy some [MASK] for me.
PART anne went [MASK] the albert heijn at 5 o'clock [MASK] buy some milk for me.
PRON anne went to the albert heijn at 5 o'clock to buy some milk for [MASK].
PUNCT anne went to the albert heijn at 5 o'clock to buy some milk for me [MASK]
lemma-start: 12:17:4

In [8]:
example_sentence_inputs

{'input_ids': tensor([[  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  201

In [9]:
example_sentence_inputs['input_ids']

tensor([[  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,   103,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,   103,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,   103,  1996,  4789,  2002, 28418,  2078,   103,
          1019,  1051,  1005,  5119,   103,  4965,  2070,  6501,   103,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,   103,  4789,  2002, 28418,  2078,  2012,
          1019,  1051,  1005,  5119,  2000,  4965,   103,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
           103,  1051,  1005,  5119,  2000,  4965,  2070,  6501,  2005,  2033,
          1012,   102],
        [  101,  4776,  2253,  2000,  1996,  4789,  2002, 28418,  2078,  2012,
          1

In [10]:
text= 'On their very first meeting, Gilbert had not been pleasantly impressed with Hardy. But he soon saw that the man had a certain rugged strength, and there was no doubt he had suffered from the depredations of Mexico\'s casual visitors, and was ready to protect not only his own interests but those of any newcomers. He seemed to have the spirit of fair-mindedness; and he believed firmly in the possibilities of this magic land, particularly for young men. "It\'s God\'s country," he told Gilbert on more than one occasion. "Get into the soil all you can. Dig--and dig deep."'

In [11]:
whole_word_MO_tokenization_and_masking(tokenizer=tokenizer, nlp_model=nlp, sequence=text)

loading: 12:17:49.454407
On their very first meeting, Gilbert had not been pleasantly impressed with Hardy. But he soon saw that the man had a certain rugged strength, and there was no doubt he had suffered from the depredations of Mexico's casual visitors, and was ready to protect not only his own interests but those of any newcomers. He seemed to have the spirit of fair-mindedness; and he believed firmly in the possibilities of this magic land, particularly for young men. "It's God's country," he told Gilbert on more than one occasion. "Get into the soil all you can. Dig--and dig deep."
['on', 'their', 'very', 'first', 'meeting', ',', 'gilbert', 'had', 'not', 'been', 'pleasantly', 'impressed', 'with', 'hardy', '.', 'but', 'he', 'soon', 'saw', 'that', 'the', 'man', 'had', 'a', 'certain', 'rugged', 'strength', ',', 'and', 'there', 'was', 'no', 'doubt', 'he', 'had', 'suffered', 'from', 'the', 'de', '##pre', '##dation', '##s', 'of', 'mexico', "'", 's', 'casual', 'visitors', ',', 'and', '

{'input_ids': tensor([[ 101,  103, 2037,  ..., 1012, 1000,  102],
        [ 101, 2006,  103,  ..., 1012, 1000,  102],
        [ 101, 2006, 2037,  ..., 1012, 1000,  102],
        ...,
        [ 101, 2006, 2037,  ..., 1012, 1000,  102],
        [ 101, 2006, 2037,  ...,    0,    0,    0],
        [ 101, 2006, 2037,  ..., 1012, 1000,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ 101, 2006, 2037,  ..., 1012, 1000,  102],
        [ 101, 2006, 2037,  ..., 1012, 1000,  102],
        [ 101, 2006, 2037,  ..., 1012, 1000,  102],
        ...,
      

In [12]:
text2 = '"Sturgis telegraphed me that there was a big possibility of a new vein of oil down on the border," Pell was telling her. "Some important men want to talk things over with me at Bisbee.I want to get started in a day or two.Don\'t take your maid.It\'s a rough country, but you\'ll be all right.Just old clothes.You can ride a lot, so bring your habit.I\'ll be busy most of the time; but I think you\'ll like the trip.Never been down that way, have you?"'
whole_word_MO_tokenization_and_masking(tokenizer=tokenizer, nlp_model=nlp, sequence=text2)

loading: 12:17:49.593037
"Sturgis telegraphed me that there was a big possibility of a new vein of oil down on the border," Pell was telling her. "Some important men want to talk things over with me at Bisbee.I want to get started in a day or two.Don't take your maid.It's a rough country, but you'll be all right.Just old clothes.You can ride a lot, so bring your habit.I'll be busy most of the time; but I think you'll like the trip.Never been down that way, have you?"
['"', 'stu', '##rg', '##is', 'telegraph', '##ed', 'me', 'that', 'there', 'was', 'a', 'big', 'possibility', 'of', 'a', 'new', 'vein', 'of', 'oil', 'down', 'on', 'the', 'border', ',', '"', 'pe', '##ll', 'was', 'telling', 'her', '.', '"', 'some', 'important', 'men', 'want', 'to', 'talk', 'things', 'over', 'with', 'me', 'at', 'bis', '##bee', '.', 'i', 'want', 'to', 'get', 'started', 'in', 'a', 'day', 'or', 'two', '.', 'don', "'", 't', 'take', 'your', 'maid', '.', 'it', "'", 's', 'a', 'rough', 'country', ',', 'but', 'you', "'",

{'input_ids': tensor([[  101,   103, 24646,  ...,   103,   103,   102],
        [  101,  1000, 24646,  ...,  1029,  1000,   102],
        [  101,  1000, 24646,  ...,  1029,  1000,   102],
        ...,
        [  101,  1000, 24646,  ...,  1029,  1000,   102],
        [  101,  1000, 24646,  ...,  1000,   102,     0],
        [  101,  1000, 24646,  ...,  1029,  1000,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[  101,  1000, 24646,  ...,  1029,  1000,   102],
        [  101,  1000, 24646,  ...,  1029,  1000,   102],
        [  101,  1000, 

In [13]:
model = BertForMaskedLM(config=BertConfig())
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

In [14]:
outputs = model(**example_sentence_inputs, return_dict=True)

In [15]:
outputs

MaskedLMOutput(loss=tensor(10.6964, grad_fn=<NllLossBackward>), logits=tensor([[[ 0.7364,  0.2646,  0.1149,  ...,  0.0814,  0.2446,  0.2514],
         [-0.0501, -1.1074,  0.1117,  ...,  0.2692, -0.5424, -0.0633],
         [ 0.7983, -0.4943, -0.4966,  ...,  0.4069, -1.0550,  0.3728],
         ...,
         [ 1.1218,  0.2370,  0.5952,  ...,  0.5059, -0.3531,  1.1506],
         [ 1.1563, -0.7431, -0.4258,  ...,  1.0308, -0.7863,  0.3092],
         [ 0.9667,  0.0130,  0.1943,  ..., -0.0961, -0.8283,  1.1607]],

        [[ 0.6186, -0.2883, -0.0186,  ...,  0.5590,  0.1776,  0.9296],
         [-0.0717, -0.8057, -0.2610,  ...,  0.1767, -1.0513,  0.1761],
         [ 1.1157,  0.0442, -0.1142,  ...,  0.9568, -0.7546,  0.8545],
         ...,
         [ 0.6962,  0.1660,  0.0877,  ...,  0.9253,  0.4090,  0.6689],
         [ 1.4367, -0.1008, -0.0666,  ...,  1.0879, -0.1390,  0.4630],
         [ 1.6386, -0.7541,  0.2593,  ..., -0.0194, -0.9134,  0.8722]],

        [[ 0.8383,  0.1662, -0.0925,  ...,  0

In [16]:
loss = outputs.loss

In [17]:
loss.backward()

In [18]:
optimizer.step()

In [19]:
model(**example_sentence_inputs, return_dict=True)

MaskedLMOutput(loss=tensor(10.0538, grad_fn=<NllLossBackward>), logits=tensor([[[ 1.3905, -0.2540, -0.0298,  ...,  0.8064,  0.4195,  0.3095],
         [ 0.9665, -0.3469,  0.1393,  ...,  0.4941, -0.9038,  0.6805],
         [ 0.3983, -0.5277, -0.1270,  ...,  0.3952, -0.4609,  0.1088],
         ...,
         [ 1.3302,  0.3248,  0.0718,  ...,  0.8641,  0.3007,  0.7843],
         [ 1.8229, -0.9073,  0.1750,  ...,  1.4255, -0.6848,  0.7065],
         [ 1.5671, -0.7720,  0.3958,  ..., -0.7300, -0.4401,  0.6746]],

        [[ 1.3799, -0.1801,  0.2591,  ...,  0.4654, -0.0916,  0.5849],
         [ 0.4687, -0.5094, -0.7868,  ...,  0.4853,  0.1096,  0.3783],
         [ 1.0376, -0.0834, -0.6569,  ...,  1.2985, -0.7414,  0.2643],
         ...,
         [ 1.1771,  0.5741,  0.0878,  ...,  0.9046, -0.1206,  0.1959],
         [ 1.1198, -0.8541, -0.8202,  ...,  1.3653, -0.6109,  0.5063],
         [ 1.6488, -0.5187,  0.8358,  ...,  0.1345, -0.4083,  0.6425]],

        [[ 0.7943, -0.4391, -0.2170,  ...,  0

In [20]:
class MODataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = {key: val for key, val in encodings.items() if key != 'labels'}
        self.labels = encodings['labels']

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MODataset(example_sentence_inputs)
train_dataset

<__main__.MODataset at 0x1abe9809f70>

In [24]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=256,  # batch size per device during training
    per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=None            # evaluation dataset
)

In [31]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…





TrainOutput(global_step=3, training_loss=3.4128875732421875)

Testing some classification model
====================

In [None]:
from transformers import AdamW
# From paper:
# lr: 1e-4
# Beta1 = 0.9 (default)
# Beta2 = 0.999 (default)
# L2 weight decay = 0.01

# Longer sequences are disproportionately expensive
# because attention is quadratic to the sequence
# length. To speed up pretraing in our experiments,
# we pre-train the model with sequence length of
# 128 for 90% of the steps. Then, we train the rest
# 10% of the steps of sequence of 512 to learn the
# positional embeddings.



#Batch size 256 for 1e6 steps


optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

In [None]:
sentences[0]

In [None]:
#doc = nlp("That's a lot better. He was finally walking to the beaches. There he had a meeting with his father. Afterwards, he read a book. The fishing rod that he used was really old")
doc = nlp("When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.")

for token in doc:
    print(token.text, token.pos_, token.lemma_)

In [None]:
doc = nlp("San Francisco is a long drive away from here. Ah, I forgot what I was doing. He had to get a new pair of shoes.")

for token in doc:
    print(token.text, token.pos_, token.lemma_)