In [16]:
!pip install seqeval
!pip install pytorch_pretrained_bert

import os

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam



### Intro 

We'll try now a deep learning model using google bert, pytorch and tensorflow.

First, let's again load our data:

In [0]:
data = pd.read_csv("dataset_biluo.csv", encoding="utf8").fillna(method="ffill")

Now, as bert expect sequences, let's create a sentence getter:

In [0]:
class SentenceGetter(object):

    def __init__(self, data, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data["Token"]
        self.labels = data["BILUO"]

    def sentences(self):
        sent = []
        counter = 0

        for token, label in zip(self.tokens, self.labels):
            if token == "DOCSTART":
                continue
            sent.append((token, label))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

    def get_next(self):
        try:
            while True:
                sent = []
                next_token = self.tokens[self.index]
                if next_token == "DOCSTART":
                    continue
                next_label = self.labels[self.index]
                sent.append((next_token, next_label))
                self.index += 1
                if next_token.strip() == ".":
                    return sent
        except:
            return None

getter = SentenceGetter(data)

Let's check our deep learning libraries are working properley:

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
print("Name of gpu: " + torch.cuda.get_device_name(0))

Device: cuda
Number of gpus: 1
Name of gpu: Tesla K80


We'll also add some constants that will determine the maximum sequence length and maximum batch sizes that we will feed the gpu:

In [0]:
MAX_LEN = 75
bs = 32

Next, let's get all of our sentences and labels:

In [21]:
all_sentences = [[token for token, label in sent] for sent in getter.sentences()]
all_orig_labels = [[label for token, label in sent] for sent in getter.sentences()]

print(all_sentences[0])
print(all_orig_labels[0])

['אחד', 'מכל', 'שני', 'ישראלים', 'אוכלים', 'קורנפלקס', '.']
['O', 'O', 'O', 'U-MISC', 'O', 'O', 'O']


Moving forward, we'll want to split our dataset into train and test:

In [0]:
train_sentences, test_sentences, train_orig_labels, test_orig_labels = train_test_split(all_sentences, all_orig_labels, random_state=2018, test_size=0.2)

As bert expects a tokenized sentence, we'll need to use the BertTokenizer with multilingual support. We'll create a function to achieve this. It's important to note that bert tend to split words, or as they call it, split into word pieces. Therefore, we'll need to update our labels arrays and expend them.

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize(sentences, orig_labels):
    tokenized_texts = []
    labels = []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)

        assert len(bert_tokens) == len(bert_labels)

    return tokenized_texts, labels


train_tokenized_texts, train_labels = tokenize(train_sentences, train_orig_labels)
print(train_tokenized_texts[0])
print(train_labels[0])

['כ', '##שר', '##ון', 'ה', '##די', '##בור', 'שלו', ',', 'ה', '##עו', '##מק', 'האי', '##נט', '##ל', '##קט', '##וא', '##לי', ',', 'ה', '##נע', '##ימות', 'ו', '##ה', '##ח', '##מי', '##מות', 'ש', '##כ', '##נע', '##ו', 'אנשים', 'רבים', 'ב', '##אר', '##ה', '"', 'ב', 'שהוא', 'יהיה', 'יום', 'אחד', 'ה', '##ת', '##שוב', '##ה', 'ה', '##ד', '##מו', '##קר', '##טית', 'ל', '##רו', '##נל', '##ד', 'ר', '##יי', '##גן', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-LOC', 'U-LOC', 'U-LOC', 'U-LOC', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'B-PERS', 'B-PERS', 'B-PERS', 'L-PERS', 'L-PERS', 'L-PERS', 'O']


Next we need to create sequences with padding to give to bert. We'll add first some utilties to convert labels into numbers:

In [0]:
tags_vals = list(set(data["BILUO"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i: t for i, t in enumerate(tags_vals)}

Now, we can convert our sentences and labels into sequences with paddings

In [0]:
def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    
    return input_ids, tags, attention_masks
  

input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_labels)

We're almost done. All that is left is to make tensors and  data loaders:

In [0]:
tr_inputs = torch.tensor(input_ids)
tr_tags = torch.tensor(tags)
tr_masks = torch.tensor(attention_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

Now we're ready to create our bert model and train it:

In [27]:
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-uncased", num_labels=len(tag2idx))

model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


epochs = 50
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))


Epoch:   0%|          | 0/50 [00:00<?, ?it/s][A
Epoch:   2%|▏         | 1/50 [01:23<1:08:35, 83.99s/it][A

Train loss: 0.5742826463631642



Epoch:   4%|▍         | 2/50 [02:47<1:07:08, 83.92s/it][A

Train loss: 0.23441463506516116



Epoch:   6%|▌         | 3/50 [04:11<1:05:41, 83.86s/it][A

Train loss: 0.13328204736297514



Epoch:   8%|▊         | 4/50 [05:35<1:04:14, 83.80s/it][A

Train loss: 0.08778117500521519



Epoch:  10%|█         | 5/50 [06:58<1:02:47, 83.72s/it][A

Train loss: 0.061445168093030834



Epoch:  12%|█▏        | 6/50 [08:22<1:01:20, 83.64s/it][A

Train loss: 0.04548323885710151



Epoch:  14%|█▍        | 7/50 [09:45<59:51, 83.53s/it]  [A

Train loss: 0.03660793381133749



Epoch:  16%|█▌        | 8/50 [11:08<58:26, 83.49s/it][A

Train loss: 0.031578253379758496



Epoch:  18%|█▊        | 9/50 [12:32<57:00, 83.43s/it][A

Train loss: 0.026149470784505944



Epoch:  20%|██        | 10/50 [13:55<55:35, 83.38s/it][A

Train loss: 0.024650572097779794



Epoch:  22%|██▏       | 11/50 [15:18<54:09, 83.32s/it][A

Train loss: 0.020464223608328603



Epoch:  24%|██▍       | 12/50 [16:41<52:43, 83.25s/it][A

Train loss: 0.019343730964824373



Epoch:  26%|██▌       | 13/50 [18:04<51:18, 83.19s/it][A

Train loss: 0.019473633648436746



Epoch:  28%|██▊       | 14/50 [19:27<49:54, 83.17s/it][A

Train loss: 0.017595879141425276



Epoch:  30%|███       | 15/50 [20:50<48:30, 83.16s/it][A

Train loss: 0.015985191730713402



Epoch:  32%|███▏      | 16/50 [22:14<47:07, 83.15s/it][A

Train loss: 0.015251200580049628



Epoch:  34%|███▍      | 17/50 [23:37<45:43, 83.14s/it][A

Train loss: 0.014070691032059214



Epoch:  36%|███▌      | 18/50 [24:59<44:13, 82.92s/it][A

Train loss: 0.013946331318835785



Epoch:  38%|███▊      | 19/50 [26:21<42:44, 82.74s/it][A

Train loss: 0.014957198022324363



Epoch:  40%|████      | 20/50 [27:44<41:24, 82.83s/it][A

Train loss: 0.013813897001522558



Epoch:  42%|████▏     | 21/50 [29:08<40:04, 82.92s/it][A

Train loss: 0.012901134978703878



Epoch:  44%|████▍     | 22/50 [30:31<38:43, 82.98s/it][A

Train loss: 0.012477564987623029



Epoch:  46%|████▌     | 23/50 [31:54<37:20, 82.98s/it][A

Train loss: 0.012373598688425795



Epoch:  48%|████▊     | 24/50 [33:17<35:58, 83.01s/it][A

Train loss: 0.012640377651773577



Epoch:  50%|█████     | 25/50 [34:40<34:35, 83.03s/it][A

Train loss: 0.011609394831243718



Epoch:  52%|█████▏    | 26/50 [36:03<33:12, 83.03s/it][A

Train loss: 0.012544329565975608



Epoch:  54%|█████▍    | 27/50 [37:26<31:49, 83.04s/it][A

Train loss: 0.012571530880831919



Epoch:  56%|█████▌    | 28/50 [38:49<30:27, 83.05s/it][A

Train loss: 0.012002095417694453



Epoch:  58%|█████▊    | 29/50 [40:12<29:03, 83.03s/it][A

Train loss: 0.011563012176653209



Epoch:  60%|██████    | 30/50 [41:35<27:40, 83.01s/it][A

Train loss: 0.01084245480582647



Epoch:  62%|██████▏   | 31/50 [42:58<26:17, 83.01s/it][A

Train loss: 0.0103822481074014



Epoch:  64%|██████▍   | 32/50 [44:21<24:54, 83.04s/it][A

Train loss: 0.011874166080141786



Epoch:  66%|██████▌   | 33/50 [45:44<23:31, 83.05s/it][A

Train loss: 0.010574248218391504



Epoch:  68%|██████▊   | 34/50 [47:07<22:08, 83.03s/it][A

Train loss: 0.01025818191744663



Epoch:  70%|███████   | 35/50 [48:30<20:45, 83.05s/it][A

Train loss: 0.010829301946221587



Epoch:  72%|███████▏  | 36/50 [49:53<19:22, 83.06s/it][A

Train loss: 0.0110242228558356



Epoch:  74%|███████▍  | 37/50 [51:16<17:59, 83.05s/it][A

Train loss: 0.011021507889561263



Epoch:  76%|███████▌  | 38/50 [52:39<16:36, 83.06s/it][A

Train loss: 0.010396962568600789



Epoch:  78%|███████▊  | 39/50 [54:02<15:13, 83.04s/it][A

Train loss: 0.010025242890050221



Epoch:  80%|████████  | 40/50 [55:25<13:50, 83.03s/it][A

Train loss: 0.010115473899116486



Epoch:  82%|████████▏ | 41/50 [56:48<12:27, 83.01s/it][A

Train loss: 0.010234051905744881



Epoch:  84%|████████▍ | 42/50 [58:11<11:04, 83.05s/it][A

Train loss: 0.010227360988659555



Epoch:  86%|████████▌ | 43/50 [59:35<09:41, 83.05s/it][A

Train loss: 0.00987350589006113



Epoch:  88%|████████▊ | 44/50 [1:00:58<08:18, 83.07s/it][A

Train loss: 0.012119245149890812



Epoch:  90%|█████████ | 45/50 [1:02:21<06:55, 83.06s/it][A

Train loss: 0.011933143186687641



Epoch:  92%|█████████▏| 46/50 [1:03:44<05:32, 83.04s/it][A

Train loss: 0.010063321737825502



Epoch:  94%|█████████▍| 47/50 [1:05:07<04:09, 83.03s/it][A

Train loss: 0.011488681833366116



Epoch:  96%|█████████▌| 48/50 [1:06:30<02:46, 83.01s/it][A

Train loss: 0.00968193696282896



Epoch:  98%|█████████▊| 49/50 [1:07:53<01:22, 82.99s/it][A

Train loss: 0.009838790160655562



Epoch: 100%|██████████| 50/50 [1:09:16<00:00, 82.99s/it][A
[A

Train loss: 0.009232318416858712


Great, we now have a trained model. Let's test it:

In [30]:
def test_model():
  classes_without_O = ['U-PERCENT', 'L-PERS', 'U-PERS', 'L-ORG', 'L-LOC', 'I-ORG', 'I-LOC', 'B-ORG', 'L-DATE', 'I-MONEY', 'B-MISC', 'L-MISC', 'L-MONEY', 'B-LOC', 'B-PERS', 'I-PERS', 'U-DATE', 'B-DATE', 'U-LOC', 'B-MONEY', 'U-MISC', 'I-MISC', 'I-DATE', 'L-PERCENT', 'I-TIME', 'U-ORG', 'L-TIME', 'B-PERCENT', 'B-TIME', 'U-TIME', 'I-PERCENT', 'U-MONEY' ]

  test_tokenized_texts, test_labels = tokenize(test_sentences, test_orig_labels)
  input_ids, tags, attention_masks = pad_sentences_and_labels(test_tokenized_texts, test_labels)

  val_inputs = torch.tensor(input_ids)
  val_tags = torch.tensor(tags)
  val_masks = torch.tensor(attention_masks)

  test_data = TensorDataset(val_inputs, val_masks, val_tags)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  predictions, true_labels = [], []
  counter = 0
  for batch in test_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      with torch.no_grad():
          tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
          logits = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask)
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      true_labels.append(label_ids)

      tmp_eval_accuracy = flat_accuracy(logits, label_ids)

      eval_loss += tmp_eval_loss.mean().item()
      eval_accuracy += tmp_eval_accuracy

      nb_eval_examples += b_input_ids.size(0)
      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  print("Validation loss: {}".format(eval_loss))
  print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
  pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
  test_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
  print("F1-Score: {}".format(f1_score(pred_tags, test_tags)))

  y_true = pd.Series(test_tags)
  y_pred = pd.Series(pred_tags)
  cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
  report = classification_report(y_true, y_pred, labels=classes_without_O, target_names=classes_without_O)
  report_with_O = classification_report(y_true, y_pred)

  return cross_tab, report, report_with_O

#     print(test_tokenized_texts[0])
#     print([idx2tag.get(i) for i in predictions[0]])
#     print([idx2tag.get(i) for i in true_labels[0][0]])
    

cross_tab, report, report_with_O = test_model()

Validation loss: 0.14696417173451795
Validation Accuracy: 0.9790476190476189
F1-Score: 0.8141135972461273


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [31]:
print(report)

              precision    recall  f1-score   support

   U-PERCENT       0.91      0.92      0.92        88
      L-PERS       0.92      0.88      0.90       389
      U-PERS       0.88      0.81      0.84       432
       L-ORG       0.81      0.86      0.84       377
       L-LOC       0.80      0.78      0.79       132
       I-ORG       0.75      0.82      0.78        92
       I-LOC       0.88      0.67      0.76        57
       B-ORG       0.83      0.77      0.80       303
      L-DATE       0.85      0.86      0.86       103
     I-MONEY       0.80      0.86      0.83        28
      B-MISC       0.56      0.59      0.57        41
      L-MISC       0.77      0.57      0.65        63
     L-MONEY       1.00      1.00      1.00        60
       B-LOC       0.77      0.81      0.79       124
      B-PERS       0.92      0.87      0.89       327
      I-PERS       0.89      0.92      0.91        52
      U-DATE       0.75      0.93      0.83       242
      B-DATE       0.85    

In [32]:
print(report_with_O)

              precision    recall  f1-score   support

      B-DATE       0.85      0.75      0.80        63
       B-LOC       0.77      0.81      0.79       124
      B-MISC       0.56      0.59      0.57        41
     B-MONEY       0.97      0.95      0.96       227
       B-ORG       0.83      0.77      0.80       303
   B-PERCENT       1.00      0.97      0.99        36
      B-PERS       0.92      0.87      0.89       327
      B-TIME       1.00      1.00      1.00         2
      I-DATE       0.67      0.58      0.62        24
       I-LOC       0.88      0.67      0.76        57
      I-MISC       0.93      0.29      0.44        49
     I-MONEY       0.80      0.86      0.83        28
       I-ORG       0.75      0.82      0.78        92
   I-PERCENT       1.00      1.00      1.00         1
      I-PERS       0.89      0.92      0.91        52
      L-DATE       0.85      0.86      0.86       103
       L-LOC       0.80      0.78      0.79       132
      L-MISC       0.77    

In [33]:
cross_tab

Prediction,B-DATE,B-LOC,B-MISC,B-MONEY,B-ORG,B-PERCENT,B-PERS,B-TIME,I-DATE,I-LOC,...,L-TIME,O,U-DATE,U-LOC,U-MISC,U-ORG,U-PERCENT,U-PERS,U-TIME,All
Real Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B-DATE,47,0,0,0,0,0,0,0,1,0,...,0,7,8,0,0,0,0,0,0,63
B-LOC,0,101,0,0,0,0,0,0,0,3,...,0,20,0,0,0,0,0,0,0,124
B-MISC,0,0,24,0,2,0,0,0,0,0,...,0,12,0,0,2,0,0,0,0,41
B-MONEY,0,0,0,216,0,0,0,0,0,0,...,0,3,2,0,0,0,0,0,0,227
B-ORG,0,10,3,0,234,0,5,0,0,0,...,0,42,0,1,0,0,0,0,0,303
B-PERCENT,0,0,0,0,0,35,0,0,0,0,...,0,1,0,0,0,0,0,0,0,36
B-PERS,0,0,0,0,0,0,285,0,0,0,...,0,38,0,0,0,0,0,4,0,327
B-TIME,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,2
I-DATE,2,0,0,0,0,0,0,0,14,0,...,0,6,0,0,0,0,0,0,0,24
I-LOC,0,7,0,0,0,0,0,0,0,38,...,0,9,0,0,0,0,0,0,0,57


### Conclusion

We presented a deep learning method using bert for learning the NER task in hebrew and we got better results
than previous methods. Awesome !!!


Hope you had fun ✋ 