In [64]:
from transformers import BertModel, BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

import torch
import numpy as np
import spacy
import re

In [2]:
sentences = ["When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.",
 'These facts seemed to me to throw some light on the origin of species--that mystery of mysteries, as it has been called by one of our greatest philosophers.',
 'On my return home, it occurred to me, in 1837, that something might perhaps be made out on this question by patiently accumulating and reflecting on all sorts of facts which could possibly have any bearing on it.',
 "After five years' work I allowed myself to speculate on the subject, and drew up some short notes; these I enlarged in 1844 into a sketch of the conclusions, which then seemed to me probable: from that period to the present day I have steadily pursued the same object.",
 'I hope that I may be excused for entering on these personal details, as I give them to show that I have not been hasty in coming to a decision.']

In [5]:
def whole_word_pos_tokenization_and_masking(sequence: str, nlp_model=None, posoi="VERB"):
        """
        posoi: Part-Of-Speech of interest
        
        Performs whole-word-masking based on selected posoi.
        
        POS possibilities:['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
                             
        TODO: What if no tokens are masked?
        
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        nlp = spacy.load("en_core_web_sm")
        spacy_sentence = nlp(sequence)
        posoi_vocab = [token.text for token in spacy_sentence if token.pos_ == posoi]
        input_ids = tokenizer.encode(sequence, add_special_tokens=False)
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        
        mask_indices = []
        composite_word_indices = []
        composite_word_tokens = []
        for (i, token) in enumerate(input_tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue
            elif token.startswith("##"):
                composite_word_indices.append(i)
                composite_word_tokens.append(token)
                if "".join([x.strip("##") for x in composite_word_tokens]) in posoi_vocab:
                    mask_indices = mask_indices + composite_word_indices
                    
            elif token in posoi_vocab:
                mask_indices.append(i)
            else:
                composite_word_indices = [i]
                composite_word_tokens = [token]
                
        mask_labels = [1 if i in mask_indices else 0 for i in range(len(input_tokens))]
        masked_tokens = [x if mask_labels[i] == 0 else 103 for i,x in enumerate(input_ids)]
        masked_input = tokenizer.decode(masked_tokens)
        print(sequence)
        print(masked_input)
        
        inputs = tokenizer(masked_input, return_tensors="pt")
        inputs['labels'] = tokenizer.encode(sequence, return_tensors="pt")
        return inputs



In [6]:
example_sentence_inputs = whole_word_pos_tokenization_and_masking("When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.")
example_sentence_inputs

When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.
when on board h. m. s.'beagle,'as naturalist, i was much [MASK] with certain facts in the distribution of the inhabitants of south america, and in the geological relations of the present to the past inhabitants of that continent.


{'input_ids': tensor([[  101,  2043,  2006,  2604,  1044,  1012,  1049,  1012,  1055,  1012,
          1005, 26892,  9354,  1010,  1005,  2004, 19176,  1010,  1045,  2001,
          2172,   103,  2007,  3056,  8866,  1999,  1996,  4353,  1997,  1996,
          4864,  1997,  2148,  2637,  1010,  1998,  1999,  1996,  9843,  4262,
          1997,  1996,  2556,  2000,  1996,  2627,  4864,  1997,  2008,  9983,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]]), 'labels': tensor([[  101,  2043,  2006,  2604,  1044,  1012,  1049,  1012,  1055,  1012,
          1005, 26892,  9354,  1010,  1005,  2004, 19176,  1010,  10

In [7]:
model = BertForMaskedLM(config=BertConfig())
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

In [8]:
outputs = model(**example_sentence_inputs, return_dict=True)

In [9]:
outputs

MaskedLMOutput(loss=tensor(10.4737, grad_fn=<NllLossBackward>), logits=tensor([[[ 0.1085, -0.5772,  0.1293,  ...,  1.0368,  0.5574,  0.9810],
         [-0.2183,  0.2934, -1.4217,  ...,  1.0051,  0.2181,  0.2821],
         [ 0.1343, -0.0310, -0.2285,  ...,  0.5672,  0.5362,  0.3733],
         ...,
         [-0.0724,  0.0114, -0.5355,  ...,  1.0788,  0.3760,  1.0045],
         [ 0.4110, -0.6894, -0.2067,  ...,  0.6501,  0.1130, -0.1410],
         [ 0.1350,  0.8632, -0.0531,  ...,  0.8801, -0.3061,  0.2938]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [10]:
loss = outputs.loss

In [11]:
loss.backward()

In [12]:
optimizer.step()

In [13]:
model(**example_sentence_inputs, return_dict=True)

MaskedLMOutput(loss=tensor(10.0098, grad_fn=<NllLossBackward>), logits=tensor([[[ 0.3021,  0.2153,  0.3544,  ...,  0.4808,  0.4257,  0.3227],
         [ 0.3361,  0.2841, -1.6514,  ...,  0.7961,  0.1002,  0.5828],
         [-0.6459, -0.2280, -0.3719,  ...,  0.2865,  0.4029,  0.3031],
         ...,
         [ 0.2780,  0.7134, -0.7532,  ...,  1.0657,  0.1708,  0.6355],
         [-0.3745,  0.1551, -0.5077,  ...,  0.7912,  0.3046, -0.2200],
         [-0.8340, -0.1227, -0.5316,  ...,  0.2119,  0.2569,  0.1668]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [14]:


class MODataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = {key: val for key, val in encodings.items() if key != 'labels'}
        self.labels = encodings['labels']

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MODataset(example_sentence_inputs)
train_dataset

<__main__.MODataset at 0x1b3cf5284f0>

In [15]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=256,  # batch size per device during training
    per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=None            # evaluation dataset
)

In [16]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…





TrainOutput(global_step=3, training_loss=9.672528584798178)

In [87]:
def whole_word_MO_tokenization_and_masking(sequence: str, nlp_model=None):
        """
        posoi: Part-Of-Speech of interest
        
        Performs whole-word-masking based on selected posoi.
        
        POS possibilities:['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
                             
        TODO: What if no tokens are masked?
        
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        nlp = spacy.load("en_core_web_sm")
        # Disable some parts for efficiency
        spacy_sentence = nlp(sequence, disable=["parser"])
        
        POS_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
        NER_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 
                    'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
        NER_pairs = ['']
        
        input_ids = tokenizer.encode(sequence, add_special_tokens=False)
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        
        sequence_pos_list = [token.pos_ for token in spacy_sentence]
        sequence_pos_frequency = {pos: sequence_pos_list.count(pos) for pos in sequence_pos_list}
        
        modified_input_list = []
        
        #POS-masking
        for posoi in sequence_pos_frequency.keys():
            posoi_vocab = [token.text for token in spacy_sentence if token.pos_ == posoi]
            
            mask_indices = []
            composite_word_indices = []
            composite_word_tokens = []
            for (i, token) in enumerate(input_tokens):
                if token == "[CLS]" or token == "[SEP]":
                    continue
                elif token.startswith("##"):
                    composite_word_indices.append(i)
                    composite_word_tokens.append(token)
                    if "".join([x.strip("##") for x in composite_word_tokens]) in posoi_vocab:
                        mask_indices = mask_indices + composite_word_indices

                elif token in posoi_vocab:
                    mask_indices.append(i)
                else:
                    composite_word_indices = [i]
                    composite_word_tokens = [token]

            mask_labels = [1 if i in mask_indices else 0 for i in range(len(input_tokens))]
            masked_tokens = [x if mask_labels[i] == 0 else 103 for i,x in enumerate(input_ids)]
            masked_input = tokenizer.decode(masked_tokens)
            
            modified_input_list.append((posoi, masked_input))
        
        #POS-based lemmatization
        replacement_tuples = [(token.text, token.lemma_) for token in spacy_sentence if token.text.lower() != token.lemma_]
        print(replacement_tuples)
        replaced_sentence = sequence
        for replacement in replacement_tuples:
            replaced_sentence = re.sub(r'\b' + replacement[0] + r'\b', replacement[1], replaced_sentence, flags=re.IGNORECASE)
        replaced_sentence = replaced_sentence.replace("  ", " ")
        modified_input_list.append(('Lemma', replaced_sentence))
        
        #NER-based swapping of time-place (if present)
        
        
        
        for mask in modified_input_list:
            print(mask)
        print(sequence)
        inputs = tokenizer(modified_input_list, return_tensors="pt", padding=True)
        
        
        inputs['labels'] = tokenizer.encode(sequence, return_tensors="pt")
        return inputs
    
def lemmatize_sequence(sequence, posoi='VERB'):
    """
    TODO add probability?
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sequence)
    replacement_tuples = [(token.text, token.lemma_) for token in doc if token.text.lower() != token.lemma_ and token.pos_ == posoi]
    replaced_sentence = sequence
    for replacement in replacement_tuples:
        replaced_sentence = replaced_sentence.replace(replacement[0], " " + replacement[1])
    replaced_sentence = replaced_sentence.replace("  ", " ")
    return sequence, replaced_sentence
    

In [89]:
whole_word_MO_tokenization_and_masking("Anne went to the Albert Heijn to buy some milk. After that, me and my buddy went to the bathhouse.")

[('Anne', 'Anne'), ('went', 'go'), ('Albert', 'Albert'), ('Heijn', 'Heijn'), ('me', 'I'), ('went', 'go')]
('PROPN', '[MASK] went to the [MASK] [MASK] [MASK] [MASK] to buy some milk. After that, me and my buddy went to the bathhouse.')
('VERB', 'Anne [MASK] to the Albert Heijn to [MASK] some milk. After that, me and my buddy [MASK] to the bathhouse.')
('ADP', 'Anne went [MASK] the Albert Heijn [MASK] buy some milk. [MASK] that, me and my buddy went [MASK] the bathhouse.')
('DET', 'Anne went to [MASK] Albert Heijn to buy [MASK] milk. After [MASK], me and my buddy went to [MASK] bathhouse.')
('PART', 'Anne went [MASK] the Albert Heijn [MASK] buy some milk. After that, me and my buddy went [MASK] the bathhouse.')
('NOUN', 'Anne went to the Albert Heijn to buy some [MASK]. After that, me and my [MASK] went to the [MASK] [MASK].')
('PUNCT', 'Anne went to the Albert Heijn to buy some milk [MASK] After that [MASK] me and my buddy went to the bathhouse [MASK]')
('PRON', 'Anne went to the Albert

{'input_ids': tensor([[  101, 11629, 17195,  2249,   102,   103,  1355,  1106,  1103,   103,
           103,   103,   103,  1106,  4417,  1199,  6831,   119,  1258,  1115,
           117,  1143,  1105,  1139, 16723,  1355,  1106,  1103, 10919,  3255,
           119,   102],
        [  101,   159,  9637,  2064,   102,  3967,   103,  1106,  1103,  3986,
          1124,  1182, 22923,  1106,   103,  1199,  6831,   119,  1258,  1115,
           117,  1143,  1105,  1139, 16723,   103,  1106,  1103, 10919,  3255,
           119,   102],
        [  101,  5844,  2101,   102,  3967,  1355,   103,  1103,  3986,  1124,
          1182, 22923,   103,  4417,  1199,  6831,   119,   103,  1115,   117,
          1143,  1105,  1139, 16723,  1355,   103,  1103, 10919,  3255,   119,
           102,     0],
        [  101, 18581,  1942,   102,  3967,  1355,  1106,   103,  3986,  1124,
          1182, 22923,  1106,  4417,   103,  6831,   119,  1258,   103,   117,
          1143,  1105,  1139, 16723,  1355,  

In [130]:
whole_word_MO_tokenization_and_masking("The primary objective of the German forces was to compel Britain to agree to a negotiated peace settlement. In July 1940, the air and sea blockade began, with the Luftwaffe mainly targeting coastal-shipping convoys, as well as ports and shipping centres such as Portsmouth. On 1 August, the Luftwaffe was directed to achieve air superiority over the RAF, with the aim of incapacitating RAF Fighter Command; 12 days later, it shifted the attacks to RAF airfields and infrastructure.")

[('forces', 'force'), ('was', 'be'), ('Britain', 'Britain'), ('negotiated', 'negotiate'), ('July', 'July'), ('began', 'begin'), ('Luftwaffe', 'Luftwaffe'), ('targeting', 'target'), ('convoys', 'convoy'), ('ports', 'port'), ('centres', 'centre'), ('Portsmouth', 'Portsmouth'), ('August', 'August'), ('Luftwaffe', 'Luftwaffe'), ('was', 'be'), ('directed', 'direct'), ('RAF', 'RAF'), ('incapacitating', 'incapacitate'), ('RAF', 'RAF'), ('Fighter', 'Fighter'), ('Command', 'Command'), ('days', 'day'), ('shifted', 'shift'), ('attacks', 'attack'), ('RAF', 'RAF'), ('airfields', 'airfield')]
('DET', '[MASK] primary objective of [MASK] German forces was to compel Britain to agree to [MASK] negotiated peace settlement. In July 1940, [MASK] air and sea blockade began, with [MASK] Luftwaffe mainly targeting coastal - shipping convoys, as well as ports and shipping centres such as Portsmouth. On 1 August, [MASK] Luftwaffe was directed to achieve air superiority over [MASK] RAF, with [MASK] aim of incapa

{'input_ids': tensor([[  101, 18581,  1942,  ...,   119,   102,     0],
        [  101,  5844,  4538,  ...,   119,   102,     0],
        [  101, 24819, 27370,  ...,   119,   102,     0],
        ...,
        [  101,  5844,  2559,  ...,   119,   102,     0],
        [  101, 11629, 11414,  ...,   119,   102,     0],
        [  101,  3180, 12917,  ...,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        ...,
        [0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 0, 0]]), 'labels': tensor([[  101,  1109,  2425,  7649,  1104,  1103,  1528,  2088,  1108,  1106,
          3254, 10522,  2855,  1106,  5340,  1106,   170, 14

Spacy testing
===========

In [200]:
nlp = spacy.load("en_core_web_sm")
#en_core_web_trf doesnt work in spacy 3.0
spacy_sentence = nlp("Apple is looking at aggresively buying U.K. startup for $1.2 billion. They walked 5 km")
test_pos_list = []
for token in spacy_sentence:
    test_pos_list.append(token.pos_)
    print(token, token.pos_, token.lemma_, token.text.lower())

Apple PROPN Apple apple
is AUX be is
looking VERB look looking
at ADP at at
aggresively ADV aggresively aggresively
buying VERB buy buying
U.K. PROPN U.K. u.k.
startup NOUN startup startup
for ADP for for
$ SYM $ $
1.2 NUM 1.2 1.2
billion NUM billion billion
. PUNCT . .
They PRON they they
walked VERB walk walked
5 NUM 5 5
km NOUN km km


In [206]:
[(x, x+len(['ADV', 'VERB'])) for x in range(len(test_pos_list)) if test_pos_list[x:x+len(['ADV', 'VERB'])] == ['ADV', 'VERB']]

[(4, 6)]

In [196]:
sequence_pos_list = [token.pos_ for token in spacy_sentence]
sequence_pos_frequency = {pos: sequence_pos_list.count(pos) for pos in sequence_pos_list}
sequence_pos_frequency

{'PROPN': 2,
 'AUX': 1,
 'VERB': 3,
 'ADP': 2,
 'ADV': 1,
 'NOUN': 2,
 'SYM': 1,
 'NUM': 3,
 'PUNCT': 1,
 'PRON': 1}

In [197]:
for ent in spacy_sentence.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 39 43 GPE
$1.2 billion 56 68 MONEY
5 82 83 CARDINAL


In [198]:
time = spacy_sentence.text[44:56].split(" ")
time.reverse()
' '.join(time)

' for startup'

In [199]:
spacy_sentence.cats

{}

In [79]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1b3cc0d8d10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1b40b7e5a90>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1b3cbd48d00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1b3cbc0be20>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1b414456ac0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1b513751ec0>)]

Testing some classification model
====================

In [16]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [17]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_batch = ["I love Pixar.", "I don't care for Pixar."]
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [18]:
import torch
labels = torch.tensor([1,0]).unsqueeze(0)
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs[0]
loss.backward()
optimizer.step()

NameError: name 'optimizer' is not defined

In [None]:
loss

In [None]:
BertConfig()

In [None]:
from transformers import AdamW
# From paper:
# lr: 1e-4
# Beta1 = 0.9 (default)
# Beta2 = 0.999 (default)
# L2 weight decay = 0.01

# Longer sequences are disproportionately expensive
# because attention is quadratic to the sequence
# length. To speed up pretraing in our experiments,
# we pre-train the model with sequence length of
# 128 for 90% of the steps. Then, we train the rest
# 10% of the steps of sequence of 512 to learn the
# positional embeddings.



#Batch size 256 for 1e6 steps


optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

In [None]:
model = BertForPreTraining(BertConfig())
model.train()

In [88]:
pos_string = """ADJ: adjective
ADP: adposition
ADV: adverb
AUX: auxiliary
CCONJ: coordinating conjunction
DET: determiner
INTJ: interjection
NOUN: noun
NUM: numeral
PART: particle
PRON: pronoun
PROPN: proper noun
PUNCT: punctuation
SCONJ: subordinating conjunction
SYM: symbol
VERB: verb
X: other"""

In [92]:
print([x.split(":", 1)[0] for x in pos_string.split("\n")])

['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


In [93]:
pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']

In [126]:
for token in doc:
    print(token, token.pos_, token.lemma_)

When ADV when
on ADP on
board NOUN board
H.M.S. PROPN H.M.S.
' PUNCT '
Beagle PROPN Beagle
, PUNCT ,
' PUNCT '
as ADP as
naturalist ADJ naturalist
, PUNCT ,
I PRON I
was AUX be
much ADV much
struck VERB strike
with ADP with
certain ADJ certain
facts NOUN fact
in ADP in
the DET the
distribution NOUN distribution
of ADP of
the DET the
inhabitants NOUN inhabitant
of ADP of
South PROPN South
America PROPN America
, PUNCT ,
and CCONJ and
in ADP in
the DET the
geological ADJ geological
relations NOUN relation
of ADP of
the DET the
present NOUN present
to ADP to
the DET the
past ADJ past
inhabitants NOUN inhabitant
of ADP of
that DET that
continent NOUN continent
. PUNCT .


In [136]:
sentences[0]

"When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent."

In [141]:
replacement_tuples = [(token.text, token.lemma_) for token in doc if token.pos_ == 'VERB']
for replacement in replacement_tuples:
    replaced_sentence = sentences[0].replace(replacement[0], replacement[1])
replaced_sentence

"When on board H.M.S. 'Beagle,' as naturalist, I was much strike with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent."

In [183]:
def lemmatize_sequence(sequence, posoi='VERB'):
    """
    TODO add probability?
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sequence)
    replacement_tuples = [(token.text, token.lemma_) for token in doc if token.text.lower() != token.lemma_ and token.pos_ == posoi]
    replaced_sentence = sequence
    for replacement in replacement_tuples:
        replaced_sentence = replaced_sentence.replace(replacement[0], " " + replacement[1])
    replaced_sentence = replaced_sentence.replace("  ", " ")
    return sequence, replaced_sentence
    

In [84]:
lemmatize_sequence("Anne went to the Albert Heijn to buy some milk. After that, me and my buddy went to the bathhouse")

('Anne went to the Albert Heijn to buy some milk. After that, me and my buddy went to the bathhouse',
 'Anne go to the Albert Heijn to buy some milk. After that, me and my buddy go to the bathhouse')

In [186]:
doc = nlp("That's a lot better. He was finally walking to the beaches. There he had a meeting with his father. Afterwards, he read a book. The fishing rod that he used was really old")

for token in doc:
    print(token.text, token.pos_, token.lemma_)

That DET that
's AUX be
a DET a
lot NOUN lot
better ADJ well
. PUNCT .
He PRON he
was AUX be
finally ADV finally
walking VERB walk
to ADP to
the DET the
beaches NOUN beach
. PUNCT .
There ADV there
he PRON he
had VERB have
a DET a
meeting NOUN meeting
with ADP with
his PRON his
father NOUN father
. PUNCT .
Afterwards ADV afterwards
, PUNCT ,
he PRON he
read VERB read
a DET a
book NOUN book
. PUNCT .
The DET the
fishing NOUN fishing
rod NOUN rod
that DET that
he PRON he
used VERB use
was AUX be
really ADV really
old ADJ old


In [191]:
doc = nlp("San Francisco is a long drive away from here. Ah, I forgot what I was doing. He had to get a new pair of shoes.")

for token in doc:
    print(token.text, token.pos_, token.lemma_)

San PROPN San
Francisco PROPN Francisco
is AUX be
a DET a
long ADJ long
drive NOUN drive
away ADV away
from ADP from
here ADV here
. PUNCT .
Ah INTJ ah
, PUNCT ,
I PRON I
forgot VERB forget
what PRON what
I PRON I
was AUX be
doing VERB do
. PUNCT .
He PRON he
had VERB have
to PART to
get VERB get
a DET a
new ADJ new
pair NOUN pair
of ADP of
shoes NOUN shoe
. PUNCT .


In [93]:
NER_text = """PERSON	People, including fictional.
NORP	Nationalities or religious or political groups.
FAC	Buildings, airports, highways, bridges, etc.
ORG	Companies, agencies, institutions, etc.
GPE	Countries, cities, states.
LOC	Non-GPE locations, mountain ranges, bodies of water.
PRODUCT	Objects, vehicles, foods, etc. (Not services.)
EVENT	Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART	Titles of books, songs, etc.
LAW	Named documents made into laws.
LANGUAGE	Any named language.
DATE	Absolute or relative dates or periods.
TIME	Times smaller than a day.
PERCENT	Percentage, including ”%“.
MONEY	Monetary values, including unit.
QUANTITY	Measurements, as of weight or distance.
ORDINAL	“first”, “second”, etc.
CARDINAL	Numerals that do not fall under another type."""
NER_text

'PERSON\tPeople, including fictional.\nNORP\tNationalities or religious or political groups.\nFAC\tBuildings, airports, highways, bridges, etc.\nORG\tCompanies, agencies, institutions, etc.\nGPE\tCountries, cities, states.\nLOC\tNon-GPE locations, mountain ranges, bodies of water.\nPRODUCT\tObjects, vehicles, foods, etc. (Not services.)\nEVENT\tNamed hurricanes, battles, wars, sports events, etc.\nWORK_OF_ART\tTitles of books, songs, etc.\nLAW\tNamed documents made into laws.\nLANGUAGE\tAny named language.\nDATE\tAbsolute or relative dates or periods.\nTIME\tTimes smaller than a day.\nPERCENT\tPercentage, including ”%“.\nMONEY\tMonetary values, including unit.\nQUANTITY\tMeasurements, as of weight or distance.\nORDINAL\t“first”, “second”, etc.\nCARDINAL\tNumerals that do not fall under another type.'

In [99]:
print([line.split("\t")[0] for line in NER_text.split("\n")])

['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']


Building custom transformer based models for pre-training
====================

In [119]:
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [None]:
from transformers.file_utils import ModelOutput
from transformers 

In [None]:
class BertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score

In [84]:
for token in doc:
    print(token, token.pos_, token.lemma_)

When ADV when
on ADP on
board NOUN board
H.M.S. PROPN H.M.S.
' PUNCT '
Beagle PROPN Beagle
, PUNCT ,
' PUNCT '
as ADP as
naturalist ADJ naturalist
, PUNCT ,
I PRON I
was AUX be
much ADV much
struck VERB strike
with ADP with
certain ADJ certain
facts NOUN fact
in ADP in
the DET the
distribution NOUN distribution
of ADP of
the DET the
inhabitants NOUN inhabitant
of ADP of
South PROPN South
America PROPN America
, PUNCT ,
and CCONJ and
in ADP in
the DET the
geological ADJ geological
relations NOUN relation
of ADP of
the DET the
present NOUN present
to ADP to
the DET the
past ADJ past
inhabitants NOUN inhabitant
of ADP of
that DET that
continent NOUN continent
. PUNCT .


In [None]:
class BertForPreTraining(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

[DOCS]    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Example::

            >>> from transformers import BertTokenizer, BertForPreTraining
            >>> import torch

            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.prediction_logits
            >>> seq_relationship_logits = outputs.seq_relationship_logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [65]:
dir(doc[2])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [86]:
lemmatized_doc = [token if token.pos_ != 'VERB' else token.lemma_ for token in doc]
lemmatized_doc

[When,
 on,
 board,
 H.M.S.,
 ',
 Beagle,
 ,,
 ',
 as,
 naturalist,
 ,,
 I,
 was,
 much,
 'strike',
 with,
 certain,
 facts,
 in,
 the,
 distribution,
 of,
 the,
 inhabitants,
 of,
 South,
 America,
 ,,
 and,
 in,
 the,
 geological,
 relations,
 of,
 the,
 present,
 to,
 the,
 past,
 inhabitants,
 of,
 that,
 continent,
 .]

In [None]:
class BertForMOPreTrainingOutput(ModelOutput):
    """
    Output type of :class:`~transformers.BertForPreTraining`.

    Args:
        loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [None]:
class TFBertPreTrainingLoss:
    """
    Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
    NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss
    computation.
    """

    def compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        # make sure only labels that are not equal to -100
        # are taken into account as loss
        masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
        masked_lm_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
            mask=masked_lm_active_loss,
        )
        masked_lm_labels = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
        )
        next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100)
        next_sentence_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss
        )
        next_sentence_label = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss
        )
        masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)
        next_sentence_loss = loss_fn(y_true=next_sentence_label, y_pred=next_sentence_reduced_logits)
        masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(next_sentence_loss)[0]))
        masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)

        return masked_lm_loss + next_sentence_loss

In [None]:
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

[DOCS]    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )