In [None]:

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

data = pd.read_csv("./data/NER/ner_dataset.csv", encoding="latin1").fillna(method="ffill")

In [None]:
data.loc[data['Sentence #']=='Sentence: 12']

In [None]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [None]:
getter = SentenceGetter(data)

### This is how the sentences in the dataset look like.

In [None]:
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentelennces]
sentences[10]

In [None]:
len(sentences[10])

In [None]:
labels[10]

### The sentences are annotated with the BIO-schema and the labels look like this.

In [None]:
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

In [None]:
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [None]:
tag2idx

### Prepare the sentences and labels
Before we can start fine-tuning the model, we have to prepare the data set for the use with pytorch and bert.

In [None]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [None]:
from pytorch_pretrained_bert.tokenization import BertTokenizer

### Here we fix some configurations. We will limit our sequence length to 75 tokens and we will use a batch size of 32 as suggested by the Bert paper. Note, that Bert natively supports sequences of up to 512 tokens.


In [None]:
MAX_LEN = 75 ##max length of token in sequence
bs = 32  ##batch size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0) 

### The Bert implementation comes with a pretrained tokenizer and a definied vocabulary. We load the one related to the smallest pre-trained model bert-base-uncased. Try also the cased variate since it is well suited for NER.

In [None]:
##load BertTokenizer class
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
from pathlib import Path
Path.home() / '.pytorch_pretrained_bert'

In [None]:
tokenizer

In [None]:
sentences[10]

In [None]:
### tokenize sentences

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[10])

In [None]:
len(tokenized_texts[10])## tokenized_texts[10] has 46 tokens but labels[10] only has 40 tags
## wordpiece tokenization make some words mis-matched with their labels.

In [None]:
labels[10]

In [None]:
sentences[10]

In [None]:
labels[10][27]

In [None]:
#cut and pad the token and label sequences to our desired length.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [60]:
input_ids[10]

array([ 1999, 15335,  1010,  1037,  5164,  1997,  4584,  6126,  2037,
        4963,  1010,  2096,  2012,  1996,  2142,  3741,  6465,  1999,
        2047,  2259,  1010,  3539,  2704,  1042,  7140,  4215,  8254,
       25346,  2050,  2056,  1996, 12592,  2111,  2024, 24501,  4747,
       10421,  1999, 10723,  2107,  4740,  2013,  9846,  2037,  4382,
        1012,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])

In [63]:
sum(attention_masks[10])

46.0

In [61]:
tags[10]

array([ 9,  2,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 11,  9,
        9,  2, 12,  9,  7,  9,  7, 15,  9,  9, 16,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9])

In [16]:
tags[1]

array([16,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,  9,
        9,  9, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9])

In [17]:
tokenizer.vocab.items()



### The Bert model supports something called attention_mask, which is similar to the masking in keras. So here we create the mask to ignore the padded elements in the sequences.

In [18]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [19]:
len(attention_masks)

47959

In [20]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)

tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

Since we’re operating in pytorch, we have to convert the dataset to torch tensors.

In [21]:
## this step can be warpped
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [22]:
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

### The last step is to define the dataloaders. We shuffle the data at training time with the RandomSampler and at test time we just pass them sequentially with the SequentialSampler.

In [23]:
## this step can be warpped
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [24]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f0c110344e0>

## Setup the Bert model for finetuning
The pytorch-pretrained-bert package provides a BertForTokenClassification class for token-level predictions. BertForTokenClassification is a fine-tuning model that wraps BertModel and adds token-level classifier on top of the BertModel. The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. We load the pre-trained bert-base-uncased model and provide the number of possible labels.

In [25]:
#init BertForTokenClassification class

##from_pretrained from BERTFromPreTrained, model parameter initialization and configuration 
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

In [26]:
len(tag2idx)

17

Now we have to pass the model parameters to the GPU. ### why?

In [27]:
model.cuda();

Before we can start the fine-tuning process, we have to setup the optimizer and add the parameters it should update. A common choice is the Adam optimizer. We also add some weight_decay as regularization to the main weight matrices. If you have limited resources, you can also try to just train the linear classifier on top of Bert and keep all other weights fixed. This will still give you a good performance.

In [28]:
## need to find out where comes model.classifier.named_parameters
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [51]:
list(model.classifier.named_parameters()

[('weight', Parameter containing:
  tensor([[ 0.0166, -0.0043,  0.0182,  ..., -0.0209, -0.0159, -0.0158],
          [-0.0362,  0.0152,  0.0101,  ...,  0.0250, -0.0006,  0.0103],
          [-0.0034, -0.0003,  0.0025,  ...,  0.0262,  0.0034, -0.0194],
          ...,
          [-0.0444,  0.0079, -0.0515,  ..., -0.0323,  0.0289,  0.0133],
          [-0.0102,  0.0033, -0.0170,  ...,  0.0090, -0.0065,  0.0349],
          [ 0.0488, -0.0037, -0.0018,  ...,  0.0535,  0.0110,  0.0204]],
         device='cuda:0', requires_grad=True)), ('bias', Parameter containing:
  tensor([-0.0102, -0.0030,  0.0017, -0.0149, -0.0127, -0.0134, -0.0199,  0.0009,
           0.0006,  0.0081,  0.0025,  0.0011, -0.0025, -0.0110, -0.0166, -0.0021,
          -0.0017], device='cuda:0', requires_grad=True))]

In [29]:
list(model.named_parameters())

[('bert.embeddings.word_embeddings.weight', Parameter containing:
  tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
          [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
          [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
          ...,
          [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
          [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
          [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
         device='cuda:0', requires_grad=True)),
 ('bert.embeddings.position_embeddings.weight', Parameter containing:
  tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
            6.8312e-04,  1.5441e-02],
          [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
            2.9753e-02, -5.3247e-03],
          [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
            1.8741e-02, -7.3140e-03],
          ...,
          [ 1.7418e-02,  3.4903e

In [30]:
param_optimizer = list(model.classifier.named_parameters())  
### Q1:model.classifier(where comes this classifier from BertTokenClassification, but argument for classifier is different than declaration)

## model is inheritated from BERTPretrainedModel.from_pretrained, 
##classifier is self.classifier=nn.Linear in BertForTokenClassification, named_parameters is from nn.module

##because from_pretrained is a class method, so model is initialized with from_pretrained argument and
#has properties of both (BertForTokenClassification and BertFromPretrained) 

In [31]:
optimizer_grouped_parameters

[{'amsgrad': False,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'lr': 3e-05,
  'params': [Parameter containing:
   tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
           [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
           [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
           ...,
           [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
           [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
           [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
          device='cuda:0', requires_grad=True), Parameter containing:
   tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
             6.8312e-04,  1.5441e-02],
           [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
             2.9753e-02, -5.3247e-03],
           [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
             1.8741e-02, -7.3140e-03],
           ...,
           [ 1.7418e-

In [32]:
param_optimizer

[('weight', Parameter containing:
  tensor([[ 1.8416e-02, -1.3082e-02,  1.8926e-02,  ..., -8.4249e-03,
           -9.7926e-03, -1.7061e-02],
          [-3.1130e-02,  8.5873e-03,  8.7999e-03,  ...,  2.0150e-02,
            1.5681e-05,  1.1500e-02],
          [-1.0066e-03,  5.3418e-03,  8.3282e-04,  ...,  2.4782e-02,
           -7.2673e-05, -1.5819e-02],
          ...,
          [-3.4727e-02, -2.0423e-04, -4.0106e-02,  ..., -2.3635e-02,
            3.2493e-02,  1.4612e-02],
          [-9.8385e-03,  1.0496e-03, -8.5880e-03,  ...,  1.2257e-02,
           -5.4645e-03,  2.8671e-02],
          [ 3.9575e-02, -2.5782e-03, -5.0689e-04,  ...,  4.7664e-02,
            1.4075e-02,  1.9126e-02]], device='cuda:0', requires_grad=True)),
 ('bias', Parameter containing:
  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         device='cuda:0', requires_grad=True))]

In [33]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 3e-05
    weight_decay: 0
    weight_decay_rate: 0.01

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 3e-05
    weight_decay: 0
    weight_decay_rate: 0.0
)

In [34]:
param_optimizer[1]

('bias', Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        device='cuda:0', requires_grad=True))

In [35]:
type(train_dataloader)

torch.utils.data.dataloader.DataLoader

In [36]:
model.classifier ## 这个classifier显然已经take了 bert的768 feature and 17个label self.classifier = nn.Linear(config.hidden_size, num_labels)

Linear(in_features=768, out_features=17, bias=True)

In [37]:
# is there any place used "forward" in BertForTokenClassification? yes, overwrite the original forward

In [38]:
#in tokenization.py, didn't find  declaration for cls: tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)

In [39]:
param_optimizer[1]  ## Q3 which part is pre-trained? 

('bias', Parameter containing:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        device='cuda:0', requires_grad=True))

In [40]:
param_optimizer[0]

('weight', Parameter containing:
 tensor([[ 1.8416e-02, -1.3082e-02,  1.8926e-02,  ..., -8.4249e-03,
          -9.7926e-03, -1.7061e-02],
         [-3.1130e-02,  8.5873e-03,  8.7999e-03,  ...,  2.0150e-02,
           1.5681e-05,  1.1500e-02],
         [-1.0066e-03,  5.3418e-03,  8.3282e-04,  ...,  2.4782e-02,
          -7.2673e-05, -1.5819e-02],
         ...,
         [-3.4727e-02, -2.0423e-04, -4.0106e-02,  ..., -2.3635e-02,
           3.2493e-02,  1.4612e-02],
         [-9.8385e-03,  1.0496e-03, -8.5880e-03,  ...,  1.2257e-02,
          -5.4645e-03,  2.8671e-02],
         [ 3.9575e-02, -2.5782e-03, -5.0689e-04,  ...,  4.7664e-02,
           1.4075e-02,  1.9126e-02]], device='cuda:0', requires_grad=True))

In [41]:
model.config  ##config json file is loaded from pretrained_model

{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [42]:
#   Q5 what is this cls?
    ##def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
    #                    from_tf=False, *inputs, **kwargs):
##cls refers as class itself

In [43]:
   #Q6: *inputs, ** kwargs, and super 
    #def __init__(self, config, *inputs, **kwargs):
    #    super(BertPreTrainedModel, self).__init__()
## super here means initialize the parent class of BertPreTRainedModel, which is nn.Module

### First we define some metrics, we want to track while training. We use the f1_score from the seqeval package. You can find more details here. And we use simple accuracy on a token level comparable to the accuracy in keras.

In [44]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
epochs = 5
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.2750533165049783
Validation loss: 0.17589829062422116
Validation Accuracy: 0.8939599206349205


Epoch:  20%|██        | 1/5 [23:59<1:35:59, 1439.85s/it]

F1-Score: 0.3003589948145193
Train loss: 0.14845023628702597
Validation loss: 0.14025111734867096
Validation Accuracy: 0.9124428571428571


Epoch:  40%|████      | 2/5 [48:00<1:12:00, 1440.21s/it]

F1-Score: 0.36919592298980747
Train loss: 0.11774741971742408
Validation loss: 0.1302069279551506
Validation Accuracy: 0.9195091269841272


Epoch:  60%|██████    | 3/5 [1:12:02<48:01, 1440.59s/it]

F1-Score: 0.40171197223828803


Evaluation

In [None]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))