In [2]:
from transformers import BertModel, BertConfig
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import AdamW
from transformers import Trainer, TrainingArguments

import torch
import numpy as np
import spacy
import re

from datetime import datetime

In [3]:
pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
ner_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

In [4]:
sentences = ["When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.",
 'These facts seemed to me to throw some light on the origin of species--that mystery of mysteries, as it has been called by one of our greatest philosophers.',
 'On my return home, it occurred to me, in 1837, that something might perhaps be made out on this question by patiently accumulating and reflecting on all sorts of facts which could possibly have any bearing on it.',
 "After five years' work I allowed myself to speculate on the subject, and drew up some short notes; these I enlarged in 1844 into a sketch of the conclusions, which then seemed to me probable: from that period to the present day I have steadily pursued the same object.",
 'I hope that I may be excused for entering on these personal details, as I give them to show that I have not been hasty in coming to a decision.']

In [5]:
def whole_word_pos_tokenization_and_masking(sequence: str, nlp_model=None, posoi="VERB"):
        """
        posoi: Part-Of-Speech of interest
        
        Performs whole-word-masking based on selected posoi.
        
        POS possibilities:['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
                             
        TODO: What if no tokens are masked?
        
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        nlp = spacy.load("en_core_web_sm")
        spacy_sentence = nlp(sequence)
        posoi_vocab = [token.text for token in spacy_sentence if token.pos_ == posoi]
        input_ids = tokenizer.encode(sequence, add_special_tokens=False)
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        
        mask_indices = []
        composite_word_indices = []
        composite_word_tokens = []
        for (i, token) in enumerate(input_tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue
            elif token.startswith("##"):
                composite_word_indices.append(i)
                composite_word_tokens.append(token)
                if "".join([x.strip("##") for x in composite_word_tokens]) in posoi_vocab:
                    mask_indices = mask_indices + composite_word_indices
                    
            elif token in posoi_vocab:
                mask_indices.append(i)
            else:
                composite_word_indices = [i]
                composite_word_tokens = [token]
                
        mask_labels = [1 if i in mask_indices else 0 for i in range(len(input_tokens))]
        masked_tokens = [x if mask_labels[i] == 0 else 103 for i,x in enumerate(input_ids)]
        masked_input = tokenizer.decode(masked_tokens)
        print(sequence)
        print(masked_input)
        
        inputs = tokenizer(masked_input, return_tensors="pt")
        inputs['labels'] = tokenizer.encode(sequence, return_tensors="pt")
        return inputs



In [6]:
example_sentence_inputs = whole_word_pos_tokenization_and_masking("When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.")
example_sentence_inputs

When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.
when on board h. m. s.'beagle,'as naturalist, i was much [MASK] with certain facts in the distribution of the inhabitants of south america, and in the geological relations of the present to the past inhabitants of that continent.


{'input_ids': tensor([[  101,  2043,  2006,  2604,  1044,  1012,  1049,  1012,  1055,  1012,
          1005, 26892,  9354,  1010,  1005,  2004, 19176,  1010,  1045,  2001,
          2172,   103,  2007,  3056,  8866,  1999,  1996,  4353,  1997,  1996,
          4864,  1997,  2148,  2637,  1010,  1998,  1999,  1996,  9843,  4262,
          1997,  1996,  2556,  2000,  1996,  2627,  4864,  1997,  2008,  9983,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]]), 'labels': tensor([[  101,  2043,  2006,  2604,  1044,  1012,  1049,  1012,  1055,  1012,
          1005, 26892,  9354,  1010,  1005,  2004, 19176,  1010,  10

In [7]:
model = BertForMaskedLM(config=BertConfig())
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

In [8]:
outputs = model(**example_sentence_inputs, return_dict=True)

In [9]:
outputs

MaskedLMOutput(loss=tensor(10.4704, grad_fn=<NllLossBackward>), logits=tensor([[[-0.3646,  0.5917, -0.1603,  ..., -0.7111, -0.0527,  0.4167],
         [-1.0825,  0.4580, -0.3780,  ..., -0.4931, -0.5675,  0.1373],
         [-0.1531,  0.0095, -0.2495,  ..., -0.3558, -0.2613,  0.1726],
         ...,
         [-0.5535,  0.4676,  0.1559,  ..., -0.1072, -0.4580, -0.3126],
         [ 0.1005,  1.0893,  0.4038,  ...,  0.1370, -0.3725,  0.0565],
         [-0.0343,  0.5544, -0.1182,  ..., -0.7318, -0.2411,  0.0796]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [10]:
loss = outputs.loss

In [11]:
loss.backward()

In [12]:
optimizer.step()

In [13]:
model(**example_sentence_inputs, return_dict=True)

MaskedLMOutput(loss=tensor(10.0311, grad_fn=<NllLossBackward>), logits=tensor([[[-0.0676,  0.5329,  0.1456,  ..., -0.5788,  0.4360,  0.5184],
         [-0.5390,  0.2936,  0.0171,  ..., -0.5895, -0.6560, -0.1490],
         [-0.6003,  0.3584,  0.0722,  ..., -0.1600, -0.6146,  0.0249],
         ...,
         [-0.2442,  0.7161, -0.2941,  ..., -0.3834, -0.2195,  0.4626],
         [-0.4728,  0.8677,  0.8608,  ...,  0.4586, -0.1241,  0.6362],
         [-0.7090,  0.9102, -0.5073,  ...,  0.2590, -0.7039,  0.1553]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [14]:


class MODataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = {key: val for key, val in encodings.items() if key != 'labels'}
        self.labels = encodings['labels']

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MODataset(example_sentence_inputs)
train_dataset

<__main__.MODataset at 0x224c7212160>

In [15]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=256,  # batch size per device during training
    per_device_eval_batch_size=256,   # batch size for evaluation
    learning_rate=1e-5,     
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=None            # evaluation dataset
)

In [16]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1.0, style=ProgressStyle(description_widt…





TrainOutput(global_step=3, training_loss=9.627637227376303)

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
nlp = spacy.load("en_core_web_sm")

In [18]:
def whole_word_MO_tokenization_and_masking(tokenizer, nlp_model, sequence: str):
        """
        posoi: Part-Of-Speech of interest
        
        Performs whole-word-masking based on selected posoi.
        
        POS possibilities:['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
                             
        TODO: What if no tokens are masked?
        
        """
        print('loading:', datetime.now().time())
        spacy_sentence = nlp_model(sequence, disable=["parser"])
        
        POS_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 
                            'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
        NER_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 
                    'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
        NER_pairs = ['']
        
        input_ids = tokenizer.encode(sequence, add_special_tokens=False)
        input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(input_tokens)
        sequence_pos_list = [token.pos_ for token in spacy_sentence]
        print()
        sequence_pos_frequency = {pos: sequence_pos_list.count(pos) for pos in sequence_pos_list}
        
        modified_input_list = []
        
        #POS-masking
        print('pos-start:', datetime.now().time())
        for posoi in sequence_pos_frequency.keys():
            posoi_vocab = [token.text for token in spacy_sentence if token.pos_ == posoi]
            
            mask_indices = []
            composite_word_indices = []
            composite_word_tokens = []
            for (i, token) in enumerate(input_tokens):
                if token == "[CLS]" or token == "[SEP]":
                    continue
                elif token.startswith("##"):
                    composite_word_indices.append(i)
                    composite_word_tokens.append(token)
                    if "".join([x.strip("##") for x in composite_word_tokens]) in posoi_vocab:
                        mask_indices = mask_indices + composite_word_indices

                elif token in posoi_vocab:
                    mask_indices.append(i)
                else:
                    composite_word_indices = [i]
                    composite_word_tokens = [token]

            mask_labels = [1 if i in mask_indices else 0 for i in range(len(input_tokens))]
            masked_tokens = [x if mask_labels[i] == 0 else 103 for i,x in enumerate(input_ids)]
            masked_input = tokenizer.decode(masked_tokens)
        
            modified_input_list.append((posoi, masked_input))
        
        print('lemma-start:', datetime.now().time())
        #POS-based lemmatization
        replacement_tuples = [(token.text, token.lemma_) for token in spacy_sentence if token.text.lower() != token.lemma_]
        pos_replaced_sentence = sequence
        for replacement in replacement_tuples:
            pos_replaced_sentence = re.sub(r'\b' + replacement[0] + r'\b', replacement[1], pos_replaced_sentence)

        pos_replaced_sentence = pos_replaced_sentence.replace("  ", " ")
        modified_input_list.append(('Lemma', pos_replaced_sentence))
        
        #NER-based swapping of time-place (if present)
        print('ner-start:', datetime.now().time())
        ner_swapped_sentence = spacy_sentence.text
        for ent in spacy_sentence.ents:
            if ent.label_ == 'TIME':
                time_substring = ner_swapped_sentence[ent.start_char:ent.end_char].split(" ")
                time_substring.reverse()
                ner_swapped_sentence = ner_swapped_sentence.replace(ner_swapped_sentence[ent.start_char:ent.end_char], " ".join(time_substring))
                
        modified_input_list.append(('NER', ner_swapped_sentence))
        
        
        #TODO future ideas
        
        #Show all resulting sequences    
        for mask in modified_input_list:
            print(mask)            
        print(sequence)
        inputs = tokenizer(modified_input_list, return_tensors="pt", padding=True)
        
        
        inputs['labels'] = tokenizer.encode(sequence, return_tensors="pt")
        return inputs

In [33]:
print(datetime.now().time())
test_sentence = "Anne went to the Albert Heijn at 5 o'clock to buy some milk for me."
whole_word_MO_tokenization_and_masking(tokenizer=tokenizer, nlp_model=nlp, sequence=test_sentence)
print(datetime.now().time())

12:55:18.417153
loading: 12:55:18.417153
['Anne', 'went', 'to', 'the', 'Albert', 'He', '##i', '##jn', 'at', '5', 'o', "'", 'clock', 'to', 'buy', 'some', 'milk', 'for', 'me', '.']

pos-start: 12:55:18.426132
lemma-start: 12:55:18.427129
ner-start: 12:55:18.427129
('PROPN', "[MASK] went to the [MASK] [MASK] [MASK] [MASK] at 5 o'clock to buy some milk for me.")
('VERB', "Anne [MASK] to the Albert Heijn at 5 o'clock to [MASK] some milk for me.")
('ADP', "Anne went [MASK] the Albert Heijn [MASK] 5 o'clock [MASK] buy some milk [MASK] me.")
('DET', "Anne went to [MASK] Albert Heijn at 5 o'clock to buy [MASK] milk for me.")
('NUM', "Anne went to the Albert Heijn at [MASK] o'clock to buy some milk for me.")
('NOUN', "Anne went to the Albert Heijn at 5 o'clock to buy some [MASK] for me.")
('PART', "Anne went [MASK] the Albert Heijn at 5 o'clock [MASK] buy some milk for me.")
('PRON', "Anne went to the Albert Heijn at 5 o'clock to buy some milk for [MASK].")
('PUNCT', "Anne went to the Albert Hei

In [20]:
whole_word_MO_tokenization_and_masking(tokenizer=tokenizer, nlp_model=nlp, "The primary objective of the German forces was to compel Britain to agree to a negotiated peace settlement. In July 1940, the air and sea blockade began, with the Luftwaffe mainly targeting coastal-shipping convoys, as well as ports and shipping centres such as Portsmouth. On 1 August, the Luftwaffe was directed to achieve air superiority over the RAF, with the aim of incapacitating RAF Fighter Command; 12 days later, it shifted the attacks to RAF airfields and infrastructure.")

TypeError: whole_word_MO_tokenization_and_masking() missing 2 required positional arguments: 'nlp_model' and 'sequence'

Spacy testing
===========

In [43]:
nlp = spacy.load("en_core_web_sm")
#en_core_web_trf doesnt work in spacy 3.0
#spacy_sentence = nlp("Apple is looking at aggresively buying U.K. startup for $1.2 billion. They walked 5 km")
spacy_sentence = nlp("It's in Best")
test_pos_list = []
for token in spacy_sentence:
    test_pos_list.append(token.pos_)
    print((token, token.pos_, token.lemma_))

(It, 'PRON', 'it')
('s, 'AUX', 'be')
(in, 'ADP', 'in')
(Best, 'PROPN', 'Best')


In [29]:
sequence_pos_list = [token.pos_ for token in spacy_sentence]
print(sequence_pos_list)
sequence_pos_frequency = {pos: sequence_pos_list.count(pos) for pos in sequence_pos_list}
sequence_pos_frequency

['PROPN', 'VERB', 'ADP', 'DET', 'PROPN', 'PROPN', 'ADP', 'NUM', 'NOUN', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'PRON', 'PUNCT']


{'PROPN': 3,
 'VERB': 2,
 'ADP': 3,
 'DET': 2,
 'NUM': 1,
 'NOUN': 2,
 'PART': 1,
 'PRON': 1,
 'PUNCT': 1}

In [219]:
for ent in spacy_sentence.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Anne 0 4 PERSON
the Albert Heijn 13 29 ORG
5 o'clock 33 42 TIME


In [230]:
test_string = spacy_sentence.text
for ent in spacy_sentence.ents:
    if ent.label_ == 'TIME':
        time_substring = test_string[ent.start_char:ent.end_char].split(" ")
        time_substring.reverse()
        test_string = test_string.replace(test_string[ent.start_char:ent.end_char], " ".join(time_substring))
        
        
print(test_string)
        
        
#print(ent.text, ent.start_char, ent.end_char, ent.label_)

Anne went to the Albert Heijn at o'clock 5 to buy some milk. After that, me and my buddy went home.


In [227]:
test_string[ent.start_char:ent.end_char]

"5 o'clock"

In [220]:
time = spacy_sentence.text[44:56].split(" ")
time.reverse()
' '.join(time)

'm some buy o'

In [199]:
spacy_sentence.cats

{}

In [79]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1b3cc0d8d10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1b40b7e5a90>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1b3cbd48d00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1b3cbc0be20>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1b414456ac0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1b513751ec0>)]

Testing some classification model
====================

In [None]:
from transformers import AdamW
# From paper:
# lr: 1e-4
# Beta1 = 0.9 (default)
# Beta2 = 0.999 (default)
# L2 weight decay = 0.01

# Longer sequences are disproportionately expensive
# because attention is quadratic to the sequence
# length. To speed up pretraing in our experiments,
# we pre-train the model with sequence length of
# 128 for 90% of the steps. Then, we train the rest
# 10% of the steps of sequence of 512 to learn the
# positional embeddings.



#Batch size 256 for 1e6 steps


optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

In [136]:
sentences[0]

"When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent."

In [239]:
#doc = nlp("That's a lot better. He was finally walking to the beaches. There he had a meeting with his father. Afterwards, he read a book. The fishing rod that he used was really old")
doc = nlp("When on board H.M.S. 'Beagle,' as naturalist, I was much struck with certain facts in the distribution of the inhabitants of South America, and in the geological relations of the present to the past inhabitants of that continent.")

for token in doc:
    print(token.text, token.pos_, token.lemma_)

When ADV when
on ADP on
board NOUN board
H.M.S. PROPN H.M.S.
' PUNCT '
Beagle PROPN Beagle
, PUNCT ,
' PUNCT '
as ADP as
naturalist ADJ naturalist
, PUNCT ,
I PRON I
was AUX be
much ADV much
struck VERB strike
with ADP with
certain ADJ certain
facts NOUN fact
in ADP in
the DET the
distribution NOUN distribution
of ADP of
the DET the
inhabitants NOUN inhabitant
of ADP of
South PROPN South
America PROPN America
, PUNCT ,
and CCONJ and
in ADP in
the DET the
geological ADJ geological
relations NOUN relation
of ADP of
the DET the
present NOUN present
to ADP to
the DET the
past ADJ past
inhabitants NOUN inhabitant
of ADP of
that DET that
continent NOUN continent
. PUNCT .


In [191]:
doc = nlp("San Francisco is a long drive away from here. Ah, I forgot what I was doing. He had to get a new pair of shoes.")

for token in doc:
    print(token.text, token.pos_, token.lemma_)

San PROPN San
Francisco PROPN Francisco
is AUX be
a DET a
long ADJ long
drive NOUN drive
away ADV away
from ADP from
here ADV here
. PUNCT .
Ah INTJ ah
, PUNCT ,
I PRON I
forgot VERB forget
what PRON what
I PRON I
was AUX be
doing VERB do
. PUNCT .
He PRON he
had VERB have
to PART to
get VERB get
a DET a
new ADJ new
pair NOUN pair
of ADP of
shoes NOUN shoe
. PUNCT .
