In [1]:
!pip install seqeval==0.0.5
!pip install pytorch_pretrained_bert==0.4.0

import os

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam



Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Intro 

We'll try now a deep learning model using google bert, pytorch and tensorflow.

First, let's again load our data:

In [0]:
data = pd.read_csv("dataset_biluo.csv", encoding="utf8").fillna(method="ffill")

Now, as bert expect sequences, let's create a sentence getter:

In [0]:
class SentenceGetter(object):

    def __init__(self, data, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data["Token"]
        self.labels = data["BILUO"]

    def sentences(self):
        sent = []
        counter = 0

        for token, label in zip(self.tokens, self.labels):
            if token == "DOCSTART":
                continue
            sent.append((token, label))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

    def get_next(self):
        try:
            while True:
                sent = []
                next_token = self.tokens[self.index]
                if next_token == "DOCSTART":
                    continue
                next_label = self.labels[self.index]
                sent.append((next_token, next_label))
                self.index += 1
                if next_token.strip() == ".":
                    return sent
        except:
            return None

getter = SentenceGetter(data)

Let's check our deep learning libraries are working properley:

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
print("Name of gpu: " + torch.cuda.get_device_name(0))

Device: cuda
Number of gpus: 1
Name of gpu: Tesla K80


We'll also add some constants that will determine the maximum sequence length and maximum batch sizes that we will feed the gpu:

In [0]:
MAX_LEN = 75
bs = 32

Next, let's get all of our sentences and labels:

In [6]:
all_sentences = [[token for token, label in sent] for sent in getter.sentences()]
all_orig_labels = [[label for token, label in sent] for sent in getter.sentences()]

print(all_sentences[0])
print(all_orig_labels[0])

['אחד', 'מכל', 'שני', 'ישראלים', 'אוכלים', 'קורנפלקס', '.']
['O', 'O', 'O', 'U-MISC', 'O', 'O', 'O']


Moving forward, we'll want to split our dataset into train and test:

In [0]:
train_sentences, test_sentences, train_orig_labels, test_orig_labels = train_test_split(all_sentences, all_orig_labels, random_state=2018, test_size=0.2)

As bert expects a tokenized sentence, we'll need to use the BertTokenizer with multilingual support. We'll create a function to achieve this. It's important to note that bert tend to split words, or as they call it, split into word pieces. Therefore, we'll need to update our labels arrays and expend them.

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize(sentences, orig_labels):
    tokenized_texts = []
    labels = []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)

        assert len(bert_tokens) == len(bert_labels)

    return tokenized_texts, labels


train_tokenized_texts, train_labels = tokenize(train_sentences, train_orig_labels)
print(train_tokenized_texts[0])
print(train_labels[0])

['כ', '##שר', '##ון', 'ה', '##די', '##בור', 'שלו', ',', 'ה', '##עו', '##מק', 'האי', '##נט', '##ל', '##קט', '##וא', '##לי', ',', 'ה', '##נע', '##ימות', 'ו', '##ה', '##ח', '##מי', '##מות', 'ש', '##כ', '##נע', '##ו', 'אנשים', 'רבים', 'ב', '##אר', '##ה', '"', 'ב', 'שהוא', 'יהיה', 'יום', 'אחד', 'ה', '##ת', '##שוב', '##ה', 'ה', '##ד', '##מו', '##קר', '##טית', 'ל', '##רו', '##נל', '##ד', 'ר', '##יי', '##גן', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-LOC', 'U-LOC', 'U-LOC', 'U-LOC', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'B-PERS', 'B-PERS', 'B-PERS', 'L-PERS', 'L-PERS', 'L-PERS', 'O']


Next we need to create sequences with padding to give to bert. We'll add first some utilties to convert labels into numbers:

In [0]:
tags_vals = list(set(data["BILUO"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i: t for i, t in enumerate(tags_vals)}

Now, we can convert our sentences and labels into sequences with paddings

In [0]:
def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    
    return input_ids, tags, attention_masks
  

input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_labels)

We're almost done. All that is left is to make tensors and  data loaders:

In [0]:
tr_inputs = torch.tensor(input_ids)
tr_tags = torch.tensor(tags)
tr_masks = torch.tensor(attention_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

Now we're ready to create our bert model and train it:

In [12]:
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-uncased", num_labels=len(tag2idx))

model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


epochs = 50
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:   2%|▏         | 1/50 [01:21<1:06:15, 81.13s/it]

Train loss: 0.5433959453194229


Epoch:   4%|▍         | 2/50 [02:42<1:04:55, 81.16s/it]

Train loss: 0.20792146919318188


Epoch:   6%|▌         | 3/50 [04:03<1:03:36, 81.20s/it]

Train loss: 0.13101311837449486


Epoch:   8%|▊         | 4/50 [05:25<1:02:17, 81.25s/it]

Train loss: 0.09094174454609553


Epoch:  10%|█         | 5/50 [06:46<1:00:54, 81.22s/it]

Train loss: 0.0648388571255369


Epoch:  12%|█▏        | 6/50 [08:07<59:31, 81.17s/it]  

Train loss: 0.0480892972583756


Epoch:  14%|█▍        | 7/50 [09:28<58:05, 81.06s/it]

Train loss: 0.03666393825623356


Epoch:  16%|█▌        | 8/50 [10:48<56:42, 81.00s/it]

Train loss: 0.031704388512873355


Epoch:  18%|█▊        | 9/50 [12:09<55:19, 80.97s/it]

Train loss: 0.02821806796113558


Epoch:  20%|██        | 10/50 [13:30<53:56, 80.90s/it]

Train loss: 0.022489383529273817


Epoch:  22%|██▏       | 11/50 [14:51<52:34, 80.88s/it]

Train loss: 0.020221201427004957


Epoch:  24%|██▍       | 12/50 [16:12<51:13, 80.88s/it]

Train loss: 0.019611342269697307


Epoch:  26%|██▌       | 13/50 [17:32<49:50, 80.83s/it]

Train loss: 0.01625634198375966


Epoch:  28%|██▊       | 14/50 [18:53<48:29, 80.82s/it]

Train loss: 0.015636694246765087


Epoch:  30%|███       | 15/50 [20:14<47:08, 80.80s/it]

Train loss: 0.016578153270170277


Epoch:  32%|███▏      | 16/50 [21:35<45:46, 80.78s/it]

Train loss: 0.01468606274517506


Epoch:  34%|███▍      | 17/50 [22:56<44:25, 80.78s/it]

Train loss: 0.013702502580543544


Epoch:  36%|███▌      | 18/50 [24:16<43:04, 80.78s/it]

Train loss: 0.014341677434133067


Epoch:  38%|███▊      | 19/50 [25:37<41:44, 80.79s/it]

Train loss: 0.013777400193057586


Epoch:  40%|████      | 20/50 [26:58<40:23, 80.78s/it]

Train loss: 0.012922394686568252


Epoch:  42%|████▏     | 21/50 [28:19<39:01, 80.75s/it]

Train loss: 0.011720016023810998


Epoch:  44%|████▍     | 22/50 [29:39<37:39, 80.68s/it]

Train loss: 0.01294691033695859


Epoch:  46%|████▌     | 23/50 [31:00<36:18, 80.67s/it]

Train loss: 0.011724586459164174


Epoch:  48%|████▊     | 24/50 [32:20<34:58, 80.69s/it]

Train loss: 0.012008451855235538


Epoch:  50%|█████     | 25/50 [33:41<33:37, 80.71s/it]

Train loss: 0.01237493138587861


Epoch:  52%|█████▏    | 26/50 [35:02<32:17, 80.72s/it]

Train loss: 0.011666864314817903


Epoch:  54%|█████▍    | 27/50 [36:23<30:57, 80.74s/it]

Train loss: 0.01206289998896673


Epoch:  56%|█████▌    | 28/50 [37:43<29:36, 80.74s/it]

Train loss: 0.011727892158461022


Epoch:  58%|█████▊    | 29/50 [39:04<28:15, 80.75s/it]

Train loss: 0.011767537331634007


Epoch:  60%|██████    | 30/50 [40:25<26:55, 80.75s/it]

Train loss: 0.011153251808682848


Epoch:  62%|██████▏   | 31/50 [41:46<25:34, 80.75s/it]

Train loss: 0.010810013226647344


Epoch:  64%|██████▍   | 32/50 [43:06<24:13, 80.74s/it]

Train loss: 0.011260053761631113


Epoch:  66%|██████▌   | 33/50 [44:27<22:52, 80.75s/it]

Train loss: 0.01114532490329886


Epoch:  68%|██████▊   | 34/50 [45:48<21:32, 80.76s/it]

Train loss: 0.011025763444464516


Epoch:  70%|███████   | 35/50 [47:09<20:11, 80.74s/it]

Train loss: 0.010447952828447645


Epoch:  72%|███████▏  | 36/50 [48:30<18:50, 80.76s/it]

Train loss: 0.010628819428987938


Epoch:  74%|███████▍  | 37/50 [49:50<17:29, 80.75s/it]

Train loss: 0.010998763333730123


Epoch:  76%|███████▌  | 38/50 [51:11<16:08, 80.70s/it]

Train loss: 0.010311707571137375


Epoch:  78%|███████▊  | 39/50 [52:32<14:47, 80.71s/it]

Train loss: 0.010557222495539642


Epoch:  80%|████████  | 40/50 [53:52<13:27, 80.74s/it]

Train loss: 0.011203645423836546


Epoch:  82%|████████▏ | 41/50 [55:13<12:06, 80.73s/it]

Train loss: 0.009283937755641585


Epoch:  84%|████████▍ | 42/50 [56:34<10:45, 80.70s/it]

Train loss: 0.009275855555092732


Epoch:  86%|████████▌ | 43/50 [57:54<09:24, 80.71s/it]

Train loss: 0.009000133224303064


Epoch:  88%|████████▊ | 44/50 [59:15<08:04, 80.72s/it]

Train loss: 0.011253994078243174


Epoch:  90%|█████████ | 45/50 [1:00:36<06:43, 80.73s/it]

Train loss: 0.010334758575212348


Epoch:  92%|█████████▏| 46/50 [1:01:57<05:23, 80.75s/it]

Train loss: 0.009987548313802108


Epoch:  94%|█████████▍| 47/50 [1:03:17<04:02, 80.75s/it]

Train loss: 0.010879647721684493


Epoch:  96%|█████████▌| 48/50 [1:04:38<02:41, 80.74s/it]

Train loss: 0.010385963367298245


Epoch:  98%|█████████▊| 49/50 [1:05:59<01:20, 80.75s/it]

Train loss: 0.009746180940886534


Epoch: 100%|██████████| 50/50 [1:07:20<00:00, 80.69s/it]

Train loss: 0.009685530325823269





Great, we now have a trained model. Let's test it:

In [13]:
def simplify_label(label):
  if len(label.split("-"))>1:
    label = label.split("-")[1]
  return label

def test_model():
  classes_without_O = ['U-PERCENT', 'L-PERS', 'U-PERS', 'L-ORG', 'L-LOC', 'I-ORG', 'I-LOC', 'B-ORG', 'L-DATE', 'I-MONEY', 'B-MISC', 'L-MISC', 'L-MONEY', 'B-LOC', 'B-PERS', 'I-PERS', 'U-DATE', 'B-DATE', 'U-LOC', 'B-MONEY', 'U-MISC', 'I-MISC', 'I-DATE', 'L-PERCENT', 'I-TIME', 'U-ORG', 'L-TIME', 'B-PERCENT', 'B-TIME', 'U-TIME', 'I-PERCENT', 'U-MONEY' ]

  test_tokenized_texts, test_labels = tokenize(test_sentences, test_orig_labels)
  input_ids, tags, attention_masks = pad_sentences_and_labels(test_tokenized_texts, test_labels)

  val_inputs = torch.tensor(input_ids)
  val_tags = torch.tensor(tags)
  val_masks = torch.tensor(attention_masks)

  test_data = TensorDataset(val_inputs, val_masks, val_tags)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  predictions, true_labels = [], []
  counter = 0
  for batch in test_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      with torch.no_grad():
          tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
          logits = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask)
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      true_labels.append(label_ids)

      tmp_eval_accuracy = flat_accuracy(logits, label_ids)

      eval_loss += tmp_eval_loss.mean().item()
      eval_accuracy += tmp_eval_accuracy

      nb_eval_examples += b_input_ids.size(0)
      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  print("Validation loss: {}".format(eval_loss))
  print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
  pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
  test_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
  print("F1-Score: {}".format(f1_score(pred_tags, test_tags)))

  y_true = pd.Series(test_tags)
  y_pred = pd.Series(pred_tags)
  cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
  report = classification_report(y_true, y_pred, labels=classes_without_O, target_names=classes_without_O)
  report_with_O = classification_report(y_true, y_pred)

  classes_without_O = ['DATE', 'LOC', 'MISC', 'MONEY', 'ORG', 'PERCENT', 'PERS', 'TIME']
  t_y_true = [simplify_label(tags_vals[p_i]) for p in predictions for p_i in p]
  t_y_pred = [simplify_label(tags_vals[l_ii]) for l in true_labels for l_i in l for l_ii in l_i]
  
  transformed_y_true = pd.Series(t_y_true)
  transformed_y_pred = pd.Series(t_y_pred)

  cross_tab_transformed = pd.crosstab(transformed_y_true, transformed_y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
  report_transformed = classification_report(transformed_y_true, transformed_y_pred, labels=classes_without_O, target_names=classes_without_O)
  report_with_O_transformed = classification_report(transformed_y_true, transformed_y_pred)
  
  return cross_tab, report, report_with_O, cross_tab_transformed, report_transformed, report_with_O_transformed

#     print(test_tokenized_texts[0])
#     print([idx2tag.get(i) for i in predictions[0]])
#     print([idx2tag.get(i) for i in true_labels[0][0]])
    

cross_tab, report, report_with_O, cross_tab_transformed, report_transformed, report_with_O_transformed = test_model()

Validation loss: 0.14430818607174192
Validation Accuracy: 0.9790873015873013
F1-Score: 0.8195149464184996


  'recall', 'true', average, warn_for)


In [14]:
print(report)

              precision    recall  f1-score   support

   U-PERCENT       0.92      0.92      0.92        88
      L-PERS       0.91      0.95      0.93       389
      U-PERS       0.92      0.80      0.85       432
       L-ORG       0.82      0.83      0.82       377
       L-LOC       0.77      0.78      0.78       132
       I-ORG       0.68      0.46      0.55        92
       I-LOC       0.93      0.70      0.80        57
       B-ORG       0.77      0.79      0.78       303
      L-DATE       0.82      0.84      0.83       103
     I-MONEY       0.90      1.00      0.95        28
      B-MISC       0.69      0.59      0.63        41
      L-MISC       0.84      0.57      0.68        63
     L-MONEY       0.98      1.00      0.99        60
       B-LOC       0.81      0.85      0.83       124
      B-PERS       0.90      0.92      0.91       327
      I-PERS       0.89      0.94      0.92        52
      U-DATE       0.79      0.90      0.84       242
      B-DATE       0.76    

In [15]:
print(report_with_O)

              precision    recall  f1-score   support

      B-DATE       0.76      0.83      0.79        63
       B-LOC       0.81      0.85      0.83       124
      B-MISC       0.69      0.59      0.63        41
     B-MONEY       0.93      0.96      0.95       227
       B-ORG       0.77      0.79      0.78       303
   B-PERCENT       0.92      0.97      0.95        36
      B-PERS       0.90      0.92      0.91       327
      B-TIME       1.00      1.00      1.00         2
      I-DATE       0.79      0.62      0.70        24
       I-LOC       0.93      0.70      0.80        57
      I-MISC       1.00      0.27      0.42        49
     I-MONEY       0.90      1.00      0.95        28
       I-ORG       0.68      0.46      0.55        92
   I-PERCENT       1.00      1.00      1.00         1
      I-PERS       0.89      0.94      0.92        52
      I-TIME       0.00      0.00      0.00         0
      L-DATE       0.82      0.84      0.83       103
       L-LOC       0.77    

In [16]:
cross_tab

Prediction,B-DATE,B-LOC,B-MISC,B-MONEY,B-ORG,B-PERCENT,B-PERS,B-TIME,I-DATE,I-LOC,...,O,U-DATE,U-LOC,U-MISC,U-MONEY,U-ORG,U-PERCENT,U-PERS,U-TIME,All
Real Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B-DATE,52,0,0,0,0,0,0,0,1,0,...,4,6,0,0,0,0,0,0,0,63
B-LOC,0,105,2,0,1,0,0,0,0,0,...,16,0,0,0,0,0,0,0,0,124
B-MISC,0,0,24,0,0,0,0,0,0,0,...,15,0,0,2,0,0,0,0,0,41
B-MONEY,0,0,0,219,0,0,0,0,0,0,...,3,2,0,0,0,0,0,0,0,227
B-ORG,3,0,0,0,239,0,5,0,0,0,...,45,0,0,0,0,4,0,0,0,303
B-PERCENT,0,0,0,0,0,35,0,0,0,0,...,1,0,0,0,0,0,0,0,0,36
B-PERS,0,0,2,0,5,0,300,0,0,0,...,15,0,0,0,0,0,0,5,0,327
B-TIME,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,2
I-DATE,0,0,0,0,3,0,0,0,15,0,...,0,0,0,0,0,0,0,0,0,24
I-LOC,0,6,0,0,0,0,0,0,0,40,...,1,0,0,0,0,0,0,0,7,57


Now, let's simplify the tags to use 'report per chunk'

In [17]:
print(report_transformed)

              precision    recall  f1-score   support

        DATE       0.90      0.83      0.86       470
         LOC       0.83      0.90      0.86       670
        MISC       0.82      0.96      0.88       459
       MONEY       0.95      0.95      0.95       328
         ORG       0.80      0.85      0.82      1013
     PERCENT       0.96      0.95      0.95       139
        PERS       0.91      0.94      0.92      1170
        TIME       0.53      0.55      0.54        29

   micro avg       0.86      0.90      0.88      4278
   macro avg       0.84      0.86      0.85      4278
weighted avg       0.86      0.90      0.88      4278



In [18]:
print(report_with_O_transformed)

              precision    recall  f1-score   support

        DATE       0.90      0.83      0.86       470
         LOC       0.83      0.90      0.86       670
        MISC       0.82      0.96      0.88       459
       MONEY       0.95      0.95      0.95       328
           O       0.99      0.99      0.99     43872
         ORG       0.80      0.85      0.82      1013
     PERCENT       0.96      0.95      0.95       139
        PERS       0.91      0.94      0.92      1170
        TIME       0.53      0.55      0.54        29

   micro avg       0.98      0.98      0.98     48150
   macro avg       0.86      0.88      0.87     48150
weighted avg       0.98      0.98      0.98     48150



In [19]:
cross_tab_transformed

Prediction,DATE,LOC,MISC,MONEY,O,ORG,PERCENT,PERS,TIME,All
Real Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DATE,388,0,3,9,51,6,0,0,13,470
LOC,0,600,7,0,45,16,0,2,0,670
MISC,0,4,439,0,5,7,0,4,0,459
MONEY,7,0,0,312,9,0,0,0,0,328
O,23,97,69,6,43412,172,6,86,1,43872
ORG,10,8,17,0,107,858,0,13,0,1013
PERCENT,0,0,0,0,7,0,132,0,0,139
PERS,0,5,0,0,51,19,0,1095,0,1170
TIME,4,7,0,0,2,0,0,0,16,29
All,432,721,535,327,43689,1078,138,1200,30,48150


### Conclusion

We presented a deep learning method using bert for learning the NER task in hebrew and we got better results
than previous methods. Awesome !!!


Hope you had fun ✋ 