In [1]:
!pip install seqeval==0.0.5
!pip install pytorch_pretrained_bert==0.4.0

import os

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam



Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Intro 

We'll try now a deep learning model using google bert, pytorch and tensorflow.

First, let's again load our data:

In [3]:

from google.colab import files
uploaded = files.upload()

Saving dataset_biluo.csv to dataset_biluo (1).csv


In [0]:
import io
data = pd.read_csv(io.BytesIO(uploaded['dataset_biluo.csv']))

In [6]:
data.shape

(62911, 14)

In [0]:
#data = pd.read_csv("dataset_biluo.csv", encoding="utf8").fillna(method="ffill")

Now, as bert expect sequences, let's create a sentence getter:

In [0]:
data = data[~data['BILUO'].isna()]

In [0]:
class SentenceGetter(object):

    def __init__(self, data, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data["Token"]
        self.labels = data["BILUO"]

    def sentences(self):
        sent = []
        counter = 0

        for token, label in zip(self.tokens, self.labels):
            if token == "DOCSTART":
                continue
            sent.append((token, label))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

    def get_next(self):
        try:
            while True:
                sent = []
                next_token = self.tokens[self.index]
                if next_token == "DOCSTART":
                    continue
                next_label = self.labels[self.index]
                sent.append((next_token, next_label))
                self.index += 1
                if next_token.strip() == ".":
                    return sent
        except:
            return None

getter = SentenceGetter(data)

Let's check our deep learning libraries are working properley:

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
print("Name of gpu: " + torch.cuda.get_device_name(0))

Device: cuda
Number of gpus: 1
Name of gpu: Tesla K80


We'll also add some constants that will determine the maximum sequence length and maximum batch sizes that we will feed the gpu:

In [0]:
MAX_LEN = 75
bs = 32

Next, let's get all of our sentences and labels:

In [12]:
all_sentences = [[token for token, label in sent] for sent in getter.sentences()]
all_orig_labels = [[label for token, label in sent] for sent in getter.sentences()]

print(all_sentences[0])
print(all_orig_labels[0])

['אחד', 'מכל', 'שני', 'ישראלים', 'אוכלים', 'קורנפלקס', '.']
['O', 'O', 'O', 'U-MISC', 'O', 'O', 'O']


Moving forward, we'll want to split our dataset into train and test:

In [0]:
train_sentences, test_sentences, train_orig_labels, test_orig_labels = train_test_split(all_sentences, all_orig_labels, random_state=2019, test_size=0.2)

As bert expects a tokenized sentence, we'll need to use the BertTokenizer with multilingual support. We'll create a function to achieve this. It's important to note that bert tend to split words, or as they call it, split into word pieces. Therefore, we'll need to update our labels arrays and expend them.

In [21]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize(sentences, orig_labels):
    tokenized_texts = []
    labels = []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)

        assert len(bert_tokens) == len(bert_labels)

    return tokenized_texts, labels
  

BASIC_BIO_TAGS = {'MISC', 'PERCENT', 'LOC', 'TIME', 'ORG', 'PERS', 'DATE', 'MONEY'}

def split_tag_to_basic(tag):
    if len(tag.split("-")) > 1:
        return tag.split("-")[1].rstrip("\n")
    else:
        return tag

      
def tokenize_biluo(sentences, orig_labels):
  tokenized_texts = []
  labels = []
  fixed_labels_list = []
  for sent, sent_labels in zip(sentences, orig_labels):
      bert_tokens = []
      bert_labels = []
      for orig_token, orig_label in zip(sent, sent_labels):
          b_tokens = tokenizer.tokenize(orig_token)
          bert_tokens.extend(b_tokens)
          for b_token in b_tokens:
              bert_labels.append(orig_label)
      ### fix bert_labels
      fixed_labels = fix_to_biluo(bert_labels)

      ### finish fixing
      tokenized_texts.append(bert_tokens)
      labels.append(bert_labels)
      fixed_labels_list.append(fixed_labels)
      assert len(bert_tokens) == len(bert_labels) == len(fixed_labels)

  return tokenized_texts, fixed_labels_list



def fix_to_biluo(bert_labels):
    fixed_labels = [bert_labels[0]]
    for i in range(1, len(bert_labels) - 1):
        prev_tag, curr_tag, next_tag = bert_labels[i - 1], bert_labels[i], bert_labels[i + 1]
        new_tag = curr_tag
        if split_tag_to_basic(curr_tag) in BASIC_BIO_TAGS:
            if split_tag_to_basic(prev_tag) != split_tag_to_basic(curr_tag) and split_tag_to_basic(
                    curr_tag) != split_tag_to_basic(next_tag):
                new_tag = 'U-' + split_tag_to_basic(curr_tag)
            else:
                if split_tag_to_basic(prev_tag) == split_tag_to_basic(curr_tag):
                    if split_tag_to_basic(curr_tag) != split_tag_to_basic(next_tag):
                        new_tag = 'L-' + split_tag_to_basic(curr_tag)
                    else:  # split_tag_to_basic(curr_tag) == split_tag_to_basic(next_tag)
                        new_tag = 'I-' + split_tag_to_basic(curr_tag)
                else:  # split_tag_to_basic(prev_tag) != split_tag_to_basic(curr_tag) BUT split_tag_to_basic(curr_tag) != split_tag_to_basic(next_tag)
                    new_tag = 'B-' + split_tag_to_basic(curr_tag)
        fixed_labels.append(new_tag)
    if len(bert_labels) < 3:
        fixed_labels = bert_labels
    else:
        almost_last_tag = bert_labels[-2]
        last_tag = bert_labels[-1]
        if (last_tag != 'O') and almost_last_tag.split("-")[1] == last_tag.split("-")[1]:
            last_tag_fixed = "L-" + last_tag.split("-")[1]
        else:
            last_tag_fixed = last_tag
        fixed_labels.append(last_tag_fixed)
    return fixed_labels



train_tokenized_texts, train_labels = tokenize_biluo(train_sentences, train_orig_labels)
print(train_tokenized_texts[0])
print(train_labels[0])

['מעל', 'ל', '##מי', '##טת', '##ו', 'תל', '##וי', 'ת', '##צל', '##ום', 'קטן', 'של', 'ה', '##א', '##פי', '##פי', '##ור', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Next we need to create sequences with padding to give to bert. We'll add first some utilties to convert labels into numbers:

In [0]:
tags_vals = list(set(data["BILUO"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i: t for i, t in enumerate(tags_vals)}

Now, we can convert our sentences and labels into sequences with paddings

In [0]:
def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    
    return input_ids, tags, attention_masks
  

input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_labels)

We're almost done. All that is left is to make tensors and  data loaders:

In [0]:
tr_inputs = torch.tensor(input_ids)
tr_tags = torch.tensor(tags)
tr_masks = torch.tensor(attention_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

Now we're ready to create our bert model and train it:

In [26]:
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-uncased", num_labels=len(tag2idx))

model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


epochs = 50
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:   2%|▏         | 1/50 [01:20<1:05:59, 80.81s/it]

Train loss: 0.5006847572915348


Epoch:   4%|▍         | 2/50 [02:41<1:04:44, 80.92s/it]

Train loss: 0.171899883835404


Epoch:   6%|▌         | 3/50 [04:03<1:03:25, 80.96s/it]

Train loss: 0.09434538426590555


Epoch:   8%|▊         | 4/50 [05:24<1:02:05, 80.98s/it]

Train loss: 0.059884410420501674


Epoch:  10%|█         | 5/50 [06:44<1:00:43, 80.96s/it]

Train loss: 0.04619251081811977


Epoch:  12%|█▏        | 6/50 [08:05<59:19, 80.91s/it]  

Train loss: 0.037801852058849214


Epoch:  14%|█▍        | 7/50 [09:26<57:57, 80.87s/it]

Train loss: 0.032838938435838544


Epoch:  16%|█▌        | 8/50 [10:47<56:32, 80.78s/it]

Train loss: 0.024608670204969835


Epoch:  18%|█▊        | 9/50 [12:07<55:10, 80.75s/it]

Train loss: 0.021444941474193777


Epoch:  20%|██        | 10/50 [13:28<53:47, 80.70s/it]

Train loss: 0.020332408976959593


Epoch:  22%|██▏       | 11/50 [14:48<52:26, 80.67s/it]

Train loss: 0.019815217295408617


Epoch:  24%|██▍       | 12/50 [16:09<51:04, 80.64s/it]

Train loss: 0.019246658074589057


Epoch:  26%|██▌       | 13/50 [17:30<49:43, 80.64s/it]

Train loss: 0.017367968778413387


Epoch:  28%|██▊       | 14/50 [18:50<48:21, 80.60s/it]

Train loss: 0.014446061331617795


Epoch:  30%|███       | 15/50 [20:11<46:59, 80.56s/it]

Train loss: 0.015067016178408615


Epoch:  32%|███▏      | 16/50 [21:31<45:39, 80.56s/it]

Train loss: 0.014610423746714254


Epoch:  34%|███▍      | 17/50 [22:52<44:18, 80.58s/it]

Train loss: 0.014953538756760091


Epoch:  36%|███▌      | 18/50 [24:12<42:58, 80.59s/it]

Train loss: 0.014318795979632364


Epoch:  38%|███▊      | 19/50 [25:33<41:36, 80.53s/it]

Train loss: 0.013755249550830528


Epoch:  40%|████      | 20/50 [26:53<40:15, 80.52s/it]

Train loss: 0.012118294264511838


Epoch:  42%|████▏     | 21/50 [28:14<38:55, 80.52s/it]

Train loss: 0.0129296716078426


Epoch:  44%|████▍     | 22/50 [29:34<37:34, 80.52s/it]

Train loss: 0.012935134410472978


Epoch:  46%|████▌     | 23/50 [30:55<36:11, 80.44s/it]

Train loss: 0.010943377266183817


Epoch:  48%|████▊     | 24/50 [32:15<34:50, 80.40s/it]

Train loss: 0.011614675900933367


Epoch:  50%|█████     | 25/50 [33:35<33:30, 80.44s/it]

Train loss: 0.01085090564052218


Epoch:  52%|█████▏    | 26/50 [34:56<32:11, 80.46s/it]

Train loss: 0.010320832658225648


Epoch:  54%|█████▍    | 27/50 [36:16<30:50, 80.44s/it]

Train loss: 0.010910503515243575


Epoch:  56%|█████▌    | 28/50 [37:37<29:29, 80.44s/it]

Train loss: 0.010264637611625303


Epoch:  58%|█████▊    | 29/50 [38:57<28:09, 80.46s/it]

Train loss: 0.010219782522238537


Epoch:  60%|██████    | 30/50 [40:18<26:49, 80.49s/it]

Train loss: 0.010861489277930908


Epoch:  62%|██████▏   | 31/50 [41:38<25:29, 80.49s/it]

Train loss: 0.010236960679815828


Epoch:  64%|██████▍   | 32/50 [42:59<24:09, 80.51s/it]

Train loss: 0.011006879066921955


Epoch:  66%|██████▌   | 33/50 [44:19<22:47, 80.45s/it]

Train loss: 0.010452113090942464


Epoch:  68%|██████▊   | 34/50 [45:40<21:27, 80.46s/it]

Train loss: 0.010602976317678429


Epoch:  70%|███████   | 35/50 [47:00<20:06, 80.42s/it]

Train loss: 0.009360889698677684


Epoch:  72%|███████▏  | 36/50 [48:21<18:46, 80.44s/it]

Train loss: 0.010130257000801739


Epoch:  74%|███████▍  | 37/50 [49:41<17:26, 80.46s/it]

Train loss: 0.00902332532805977


Epoch:  76%|███████▌  | 38/50 [51:01<16:05, 80.44s/it]

Train loss: 0.010580341603296498


Epoch:  78%|███████▊  | 39/50 [52:22<14:44, 80.44s/it]

Train loss: 0.009869715051918096


Epoch:  80%|████████  | 40/50 [53:42<13:24, 80.44s/it]

Train loss: 0.010061184153910066


Epoch:  82%|████████▏ | 41/50 [55:03<12:03, 80.44s/it]

Train loss: 0.011281207915292017


Epoch:  84%|████████▍ | 42/50 [56:23<10:43, 80.45s/it]

Train loss: 0.01076957848308217


Epoch:  86%|████████▌ | 43/50 [57:44<09:23, 80.44s/it]

Train loss: 0.009830954659264535


Epoch:  88%|████████▊ | 44/50 [59:04<08:02, 80.45s/it]

Train loss: 0.009252706007244364


Epoch:  90%|█████████ | 45/50 [1:00:25<06:42, 80.47s/it]

Train loss: 0.010576286157348803


Epoch:  92%|█████████▏| 46/50 [1:01:45<05:21, 80.47s/it]

Train loss: 0.010846412866405085


Epoch:  94%|█████████▍| 47/50 [1:03:05<04:01, 80.42s/it]

Train loss: 0.00837226745882359


Epoch:  96%|█████████▌| 48/50 [1:04:26<02:40, 80.45s/it]

Train loss: 0.008605453914410813


Epoch:  98%|█████████▊| 49/50 [1:05:46<01:20, 80.48s/it]

Train loss: 0.006839181624214958


Epoch: 100%|██████████| 50/50 [1:07:07<00:00, 80.49s/it]

Train loss: 0.003102723898920863





Great, we now have a trained model. Let's test it:

In [27]:
def simplify_label(label):
  if len(label.split("-"))>1:
    label = label.split("-")[1]
  return label

def test_model():
  classes_without_O = ['U-PERCENT', 'L-PERS', 'U-PERS', 'L-ORG', 'L-LOC', 'I-ORG', 'I-LOC', 'B-ORG', 'L-DATE', 'I-MONEY', 'B-MISC', 'L-MISC', 'L-MONEY', 'B-LOC', 'B-PERS', 'I-PERS', 'U-DATE', 'B-DATE', 'U-LOC', 'B-MONEY', 'U-MISC', 'I-MISC', 'I-DATE', 'L-PERCENT', 'I-TIME', 'U-ORG', 'L-TIME', 'B-PERCENT', 'B-TIME', 'U-TIME', 'I-PERCENT', 'U-MONEY' ]

  test_tokenized_texts, test_labels = tokenize_biluo(test_sentences, test_orig_labels)
  input_ids, tags, attention_masks = pad_sentences_and_labels(test_tokenized_texts, test_labels)

  val_inputs = torch.tensor(input_ids)
  val_tags = torch.tensor(tags)
  val_masks = torch.tensor(attention_masks)

  test_data = TensorDataset(val_inputs, val_masks, val_tags)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  predictions, true_labels = [], []
  counter = 0
  for batch in test_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      with torch.no_grad():
          tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
          logits = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask)
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      true_labels.append(label_ids)

      tmp_eval_accuracy = flat_accuracy(logits, label_ids)

      eval_loss += tmp_eval_loss.mean().item()
      eval_accuracy += tmp_eval_accuracy

      nb_eval_examples += b_input_ids.size(0)
      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  print("Validation loss: {}".format(eval_loss))
  print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
  pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
  test_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
  print("F1-Score: {}".format(f1_score(pred_tags, test_tags)))

  y_true = pd.Series(test_tags)
  y_pred = pd.Series(pred_tags)
  cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
  report = classification_report(y_true, y_pred, labels=classes_without_O, target_names=classes_without_O)
  report_with_O = classification_report(y_true, y_pred)

  classes_without_O = ['DATE', 'LOC', 'MISC', 'MONEY', 'ORG', 'PERCENT', 'PERS', 'TIME']
  t_y_pred = [simplify_label(tags_vals[p_i]) for p in predictions for p_i in p]
  t_y_true = [simplify_label(tags_vals[l_ii]) for l in true_labels for l_i in l for l_ii in l_i]
  
  transformed_y_true = pd.Series(t_y_true)
  transformed_y_pred = pd.Series(t_y_pred)

  cross_tab_transformed = pd.crosstab(transformed_y_true, transformed_y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
  report_transformed = classification_report(transformed_y_true, transformed_y_pred, labels=classes_without_O, target_names=classes_without_O)
  report_with_O_transformed = classification_report(transformed_y_true, transformed_y_pred)
  
  return cross_tab, report, report_with_O, cross_tab_transformed, report_transformed, report_with_O_transformed

#     print(test_tokenized_texts[0])
#     print([idx2tag.get(i) for i in predictions[0]])
#     print([idx2tag.get(i) for i in true_labels[0][0]])
    

cross_tab, report, report_with_O, cross_tab_transformed, report_transformed, report_with_O_transformed = test_model()

Validation loss: 0.1250755246314622
Validation Accuracy: 0.984027777777778
F1-Score: 0.835216572504708


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [28]:
print(report)

              precision    recall  f1-score   support

   U-PERCENT       1.00      0.17      0.29         6
      L-PERS       0.92      0.90      0.91       265
      U-PERS       0.94      0.97      0.95        30
       L-ORG       0.81      0.83      0.82       209
       L-LOC       0.91      0.83      0.87       175
       I-ORG       0.86      0.81      0.83       602
       I-LOC       0.90      0.86      0.88       338
       B-ORG       0.83      0.84      0.83       204
      L-DATE       0.82      0.98      0.89        55
     I-MONEY       1.00      0.99      1.00       293
      B-MISC       0.87      0.83      0.85       123
      L-MISC       0.89      0.82      0.85       125
     L-MONEY       1.00      0.98      0.99        44
       B-LOC       0.89      0.83      0.86       168
      B-PERS       0.92      0.89      0.90       240
      I-PERS       0.95      0.90      0.92       574
      U-DATE       1.00      0.67      0.80         6
      B-DATE       0.76    

In [29]:
print(report_with_O)

              precision    recall  f1-score   support

      B-DATE       0.76      0.89      0.82        56
       B-LOC       0.89      0.83      0.86       168
      B-MISC       0.87      0.83      0.85       123
     B-MONEY       0.98      0.95      0.96        43
       B-ORG       0.83      0.84      0.83       204
   B-PERCENT       0.92      1.00      0.96        34
      B-PERS       0.92      0.89      0.90       240
      B-TIME       0.57      0.67      0.62         6
      I-DATE       0.86      0.97      0.91       287
       I-LOC       0.90      0.86      0.88       338
      I-MISC       0.85      0.72      0.78       202
     I-MONEY       1.00      0.99      1.00       293
       I-ORG       0.86      0.81      0.83       602
   I-PERCENT       1.00      0.97      0.99        71
      I-PERS       0.95      0.90      0.92       574
      I-TIME       0.57      0.71      0.63        45
      L-DATE       0.82      0.98      0.89        55
       L-LOC       0.91    

In [30]:
cross_tab

Prediction,B-DATE,B-LOC,B-MISC,B-MONEY,B-ORG,B-PERCENT,B-PERS,B-TIME,I-DATE,I-LOC,I-MISC,I-MONEY,I-ORG,I-PERCENT,I-PERS,I-TIME,L-DATE,L-LOC,L-MISC,L-MONEY,L-ORG,L-PERCENT,L-PERS,L-TIME,O,U-DATE,U-LOC,U-MISC,U-ORG,U-PERCENT,U-PERS,All
Real Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
B-DATE,50,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,56
B-LOC,0,140,0,0,4,0,3,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,1,0,0,0,0,168
B-MISC,0,1,102,0,1,0,1,0,0,1,2,0,1,0,1,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,123
B-MONEY,0,0,0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,43
B-ORG,0,0,1,0,171,0,4,1,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,204
B-PERCENT,0,0,0,0,0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,34
B-PERS,2,1,1,0,3,0,214,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,240
B-TIME,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,6
I-DATE,3,0,0,0,0,0,0,0,279,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,287
I-LOC,0,4,0,0,0,0,0,0,0,291,0,0,11,0,4,0,0,3,0,0,0,0,1,0,24,0,0,0,0,0,0,338


Now, let's simplify the tags to use 'report per chunk'

In [31]:
print(report_transformed)

              precision    recall  f1-score   support

        DATE       0.85      0.98      0.91       404
         LOC       0.93      0.87      0.90       740
        MISC       0.89      0.81      0.85       505
       MONEY       1.00      0.99      0.99       381
         ORG       0.86      0.83      0.85      1038
     PERCENT       0.98      0.98      0.98       146
        PERS       0.94      0.91      0.93      1109
        TIME       0.58      0.70      0.63        57

   micro avg       0.91      0.89      0.90      4380
   macro avg       0.88      0.88      0.88      4380
weighted avg       0.91      0.89      0.90      4380



In [32]:
print(report_with_O_transformed)

              precision    recall  f1-score   support

        DATE       0.85      0.98      0.91       404
         LOC       0.93      0.87      0.90       740
        MISC       0.89      0.81      0.85       505
       MONEY       1.00      0.99      0.99       381
           O       0.99      0.99      0.99     43695
         ORG       0.86      0.83      0.85      1038
     PERCENT       0.98      0.98      0.98       146
        PERS       0.94      0.91      0.93      1109
        TIME       0.58      0.70      0.63        57

    accuracy                           0.98     48075
   macro avg       0.89      0.90      0.89     48075
weighted avg       0.98      0.98      0.98     48075



In [33]:
cross_tab_transformed

Prediction,DATE,LOC,MISC,MONEY,O,ORG,PERCENT,PERS,TIME,All
Real Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DATE,394,0,0,0,10,0,0,0,0,404
LOC,0,645,0,0,56,28,0,11,0,740
MISC,0,7,409,0,70,13,0,6,0,505
MONEY,0,0,0,376,5,0,0,0,0,381
O,51,19,30,0,43452,85,3,27,28,43695
ORG,6,19,7,0,121,866,0,18,1,1038
PERCENT,0,0,0,0,3,0,143,0,0,146
PERS,10,6,11,0,59,14,0,1009,0,1109
TIME,0,0,0,0,17,0,0,0,40,57
All,461,696,457,376,43793,1006,146,1071,69,48075


### Conclusion

We presented a deep learning method using bert for learning the NER task in hebrew and we got better results
than previous methods. Awesome !!!


Hope you had fun ✋ 