In [1]:
from transformers import BertTokenizer, BertForPreTraining
import torch

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

In [3]:
import requests

In [4]:
data = requests.get('https://raw.githubusercontent.com/tenkara/nlp-transformers/master/14-transformer-fine-tuning/data/clean.txt')

In [5]:
text = data.text.split('\n')

In [7]:
text[:3]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

In [9]:
# create a list of sentences
bag = [sentence for para in text for sentence in para.split('.') if sentence != '']
bag_size = len(bag)
bag_size

1372

In [11]:
import random

# create a bag of correct and random sentences for training nsp
sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [sentence for sentence in paragraph.split('.') if sentence != '']

    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences - 2)
        # 50% of the time, sentence_b is the next sentence
        if random.random() >= 0.5:
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start + 1])
            label.append(0)
        else:
            # 50% of the time, sentence_b is a random sentence
            sentence_a.append(sentences[start])
            sentence_b.append(random.choice(sentences))
            label.append(1)



In [19]:
# tokenize the sentences
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, padding='max_length', truncation=True)

In [20]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [21]:
# add the next sentence label which is simply a binary label
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [22]:
# labels tensor is simply the input_ids tensor before masking
inputs['labels'] = inputs.input_ids.detach().clone()

In [23]:
# create random array of floats with equal dimensions to input_ids
random_array = torch.rand(inputs.input_ids.shape)
# mask random 15% of the input_ids
mask_array = (random_array < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [24]:
# take the masked tokens and replace them with 103
inputs.input_ids = torch.where(mask_array == 1, torch.ones_like(inputs.input_ids) * 103, inputs.input_ids)

In [25]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [26]:
inputs.input_ids[0]

tensor([  101,  1045,  5159,  1010,  2205,  1010,  2008,  2053,  2158,  2071,
         2412,  2228,  2008,   103,  2001, 26626,  2011,   103,  1010,  2030,
         2412,  6957,  2000,  2228,  2370,  1037,  2488,  2158,   102,  2002,
         2018,  2036,   103,  2396,  1997,  2108,   103,  1999,  2019,  5993,
          103,  2126,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [27]:
# Create a pytorch dataset from the data
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [28]:
# Initialize the dataset
dataset = MeditationsDataset(inputs)

In [29]:
# Initialize the dataloader, which we will use to load the data into the model for training
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

In [30]:
# before we start training, we need to set up the cpu/gpu environment
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [31]:
# move the model to the device
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
# activate training mode
model.train()

In [32]:
# initialize the optimizer (Adam with weight decay - reduces overfitting)
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [33]:
# Training loop
from tqdm import tqdm

epochs = 2
for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, next_sentence_label=next_sentence_label, labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        


Epoch 0: 100%|██████████| 40/40 [00:09<00:00,  4.19it/s, loss=0.829]
Epoch 1: 100%|██████████| 40/40 [00:07<00:00,  5.37it/s, loss=0.696]
