In [1]:
# https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=Eh3ckSO0YMZW

In [2]:
# Load dataset
import pandas as pd 

df1 = pd.read_csv('./NER_DS_ENGLISH/TOR_ILLEGAL_ENG.CSV', encoding='utf-8')
df2 = pd.read_csv('./NER_DS_ENGLISH/CONLL_ENG.csv', encoding='utf-8')
df = pd.concat([df1, df2], axis=0)

In [3]:
len(set(df.sentence_idx.tolist()))

22407

In [4]:
# Split dataset:
split_thresh = df['sentence_idx'].max() * 0.9
df_train, df_valid = df[df['sentence_idx'] < split_thresh], df[df['sentence_idx'] >= split_thresh]
len(df_train), len(df_valid)

(387804, 27348)

In [5]:
agg_func = lambda s: [ [w,t] for w,t in zip(s["word"].values.tolist(),s["tag"].values.tolist())]

In [6]:
x_train_grouped = df_train.groupby("sentence_idx").apply(agg_func)
x_valid_grouped = df_valid.groupby("sentence_idx").apply(agg_func)

x_train_sentences = [[s[0] for s in sent] for sent in x_train_grouped.values]
x_valid_sentences = [[s[0] for s in sent] for sent in x_valid_grouped.values]

x_train_tags = [[t[1] for t in tag] for tag in x_train_grouped.values]
x_valid_tags = [[t[1] for t in tag] for tag in x_valid_grouped.values]

In [7]:
tag_list = df['tag'].unique()
label_map = {label: i for i, label in enumerate(tag_list)}
label_map_inv = {i: label for i, label in enumerate(tag_list)}
num_labels = len(tag_list) + 1
num_labels

17

In [8]:
print("Number of tags: {}".format(len(df.tag.unique())))
frequencies = df.tag.value_counts()
frequencies

Number of tags: 16


O         328511
B-LOC      11242
B-PER      10370
B-ORG      10087
B-DRG       9561
I-DRG       8619
I-PER       7110
I-ORG       6372
I-WEP       5753
B-MISC      5222
B-CUR       3911
B-WEP       3038
I-MISC      1966
I-LOC       1793
I-DAT       1291
B-DAT        306
Name: tag, dtype: int64

In [9]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:] not in tags.keys():
            tags[tag[2:]] = count
        else:
            tags[tag[2:]] += count
    continue

sorted(tags.items(), key=lambda x: x[1], reverse=True)

[('DRG', 18180),
 ('PER', 17480),
 ('ORG', 16459),
 ('LOC', 13035),
 ('WEP', 8791),
 ('MISC', 7188),
 ('CUR', 3911),
 ('DAT', 1597)]

In [10]:
# entities_to_remove = ["B-GROUP", "I-GROUP",
#                       "B-NUMBERS_O", "I-NUMBERS_O",
#                       "B-NORP", "I-NORP",
#                       "B-LAW", "I-LAW",
#                       "B-NUMBERS_C", "I-NUMBERS_C",
#                       "B-DATE", "I-DATE",
#                       "B-FACILITIES", "I-FACILITIES", 
#                       "B-ART", "I-ART", 
#                       "B-Time", "I-Time", 
#                       "B-GPE-LOCATION", "I-GPE-LOCATION",
#                      ]
# df = df[~df.tag.isin(entities_to_remove)]
# df.head()

In [11]:
labels_to_ids = {k: v for v, k in enumerate(df.tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(df.tag.unique())}
labels_to_ids

{'O': 0,
 'B-DRG': 1,
 'I-DRG': 2,
 'B-CUR': 3,
 'B-ORG': 4,
 'I-ORG': 5,
 'B-DAT': 6,
 'I-DAT': 7,
 'B-LOC': 8,
 'B-PER': 9,
 'I-PER': 10,
 'I-LOC': 11,
 'B-MISC': 12,
 'I-MISC': 13,
 'B-WEP': 14,
 'I-WEP': 15}

In [12]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
df = df.fillna(method='ffill')
df.head()

Unnamed: 0.1,sentence_idx,word,tag,Unnamed: 0
0,0,100g,O,
1,0,Purple,B-DRG,
2,0,Kush,I-DRG,
3,0,650,O,
4,0,GBP,B-CUR,


In [13]:
# let's create a new column called "sentence" which groups the words by sentence 
df['sentence'] = df[['sentence_idx','word','tag']].groupby(['sentence_idx'])['word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
df['word_labels'] = df[['sentence_idx','word','tag']].groupby(['sentence_idx'])['tag'].transform(lambda x: ','.join(x))
df.head()

Unnamed: 0.1,sentence_idx,word,tag,Unnamed: 0,sentence,word_labels
0,0,100g,O,,100g Purple Kush 650 GBP 1.28 EU rejects Germa...,"O,B-DRG,I-DRG,O,B-CUR,O,B-ORG,O,B-MISC,O,O,O,B..."
1,0,Purple,B-DRG,,100g Purple Kush 650 GBP 1.28 EU rejects Germa...,"O,B-DRG,I-DRG,O,B-CUR,O,B-ORG,O,B-MISC,O,O,O,B..."
2,0,Kush,I-DRG,,100g Purple Kush 650 GBP 1.28 EU rejects Germa...,"O,B-DRG,I-DRG,O,B-CUR,O,B-ORG,O,B-MISC,O,O,O,B..."
3,0,650,O,,100g Purple Kush 650 GBP 1.28 EU rejects Germa...,"O,B-DRG,I-DRG,O,B-CUR,O,B-ORG,O,B-MISC,O,O,O,B..."
4,0,GBP,B-CUR,,100g Purple Kush 650 GBP 1.28 EU rejects Germa...,"O,B-DRG,I-DRG,O,B-CUR,O,B-ORG,O,B-MISC,O,O,O,B..."


In [14]:
df = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,sentence,word_labels
0,100g Purple Kush 650 GBP 1.28 EU rejects Germa...,"O,B-DRG,I-DRG,O,B-CUR,O,B-ORG,O,B-MISC,O,O,O,B..."
1,5g Banana Kush 0.071 X Peter Blackburn,"O,B-DRG,I-DRG,O,O,B-PER,I-PER"
2,5g pure Cocaine 0.712 X BRUSSELS 22/08/1996,"O,O,B-DRG,O,O,B-LOC,O"
3,Please use this link to go back to the top of ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-ORG,I-ORG,O,O,O,..."
4,Please use this link to go to the Front Page o...,"O,O,O,O,O,O,O,O,O,O,O,O,B-DRG,I-DRG,I-DRG,O,O,..."


In [15]:
# Remove hidden split chat
df['sentence'] = df['sentence'].str.replace('\\xa0','', regex=True)

In [16]:
len(df)

21367

In [17]:
max([len(x.split()) for x in df.sentence.tolist()])

163

In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [19]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased", do_lower_case=False)

In [20]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentence[index].strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [21]:
train_size = 0.8
train_dataset = df.sample(frac=train_size,random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (21367, 2)
TRAIN Dataset: (17094, 2)
TEST Dataset: (4273, 2)


In [22]:
test_dataset

Unnamed: 0,sentence,word_labels
0,Please use this link to go back to the top of ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-ORG,I-ORG,O,O,O,..."
1,Please use this link to go to the Front Page o...,"O,O,O,O,O,O,O,O,O,O,O,O,B-DRG,I-DRG,I-DRG,O,O,..."
2,Please use this link to go to the Cocaine We d...,"O,O,O,O,O,O,O,O,B-DRG,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,"A systematic study has never been done , but p...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-DRG,O,O,O,O,..."
4,Those beautiful and absolutely tasteless Blott...,"O,O,O,O,O,B-DRG,O,O,O,B-DRG,I-DRG,I-DRG,O,O,O,..."
...,...,...
4268,"PSV , well on the way to their 14th league tit...","B-ORG,O,O,O,O,O,O,O,O,O,O,O,O,B-ORG,O,O,O,O,O,O,O"
4269,", are one of the surprise packages of the seas...","O,O,O,O,O,O,O,O,O,O,O"
4270,SOCCER - SPANISH FIRST DIVISION RESULT / STAND...,"O,O,B-MISC,O,O,O,O,O,O"
4271,Jack Charlton 's relationship with the people ...,"B-PER,I-PER,O,O,O,O,O,O,B-LOC,O,O,O,O,O,O,B-MI..."


In [23]:
# train_dataset.iloc[13518].word_labels

In [24]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[80]["input_ids"]), training_set[80]["labels"]):
    # print('{0:10}  {1}'.format(token, label))
    pass

In [25]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [26]:
from torch import cuda
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification


device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [27]:
# model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
# model.to(device)

from transformers import AutoModel, AutoTokenizer 

model = BertForTokenClassification.from_pretrained("bert-base-cased", 
                                                   num_labels=len(labels_to_ids), 
                                                   ignore_mismatched_sizes=True)
model.to(device);

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [28]:
inputs = training_set[0]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)
# labels = labels.type(torch.LongTensor)

input_ids = input_ids.to(device, dtype = torch.long)
attention_mask = attention_mask.to(device, dtype = torch.long)
labels = labels.to(device, dtype = torch.long)


In [29]:
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.7571, device='cuda:0', grad_fn=<NllLossBackward0>)

In [30]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 256, 16])

In [31]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [32]:
import os

def save_model(tk, ml, epk):    

    directory = "./bert-base-cased_en"
    
    p  = os.path.join(directory,f"epoch_{str(epk)}")
    os.makedirs(p, exist_ok=True)
    
    # save vocabulary of the tokenizer
    tk.save_vocabulary(p)
    
    # save the model weights and its configuration file
    ml.save_pretrained(p)
    print('All files saved')


In [33]:
# Defining the training function on the 80% of the dataset for tuning the bert model
# def train(epoch):

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
        
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train();

    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)
#         labels = labels.type(torch.LongTensor).to(device)

        someoutput = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = someoutput[0], someoutput[1]        
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}, ID: {idx}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    save_model(tk=tokenizer, ml = model, epk=epoch)

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    
    

Training epoch: 1
Training loss per 100 training steps: 2.7239410877227783, ID: 0
Training loss per 100 training steps: 1.083782142341727, ID: 100
Training loss per 100 training steps: 0.8172734546572414, ID: 200
Training loss per 100 training steps: 0.6689168287571087, ID: 300
Training loss per 100 training steps: 0.5725393748640122, ID: 400
Training loss per 100 training steps: 0.5026201060759331, ID: 500
Training loss per 100 training steps: 0.4487009208506832, ID: 600
Training loss per 100 training steps: 0.40928675053101804, ID: 700
Training loss per 100 training steps: 0.3777172126703792, ID: 800
Training loss per 100 training steps: 0.35252156773877724, ID: 900
Training loss per 100 training steps: 0.3321089236908681, ID: 1000
Training loss per 100 training steps: 0.315500746206207, ID: 1100
Training loss per 100 training steps: 0.29999685773143636, ID: 1200
Training loss per 100 training steps: 0.2878770898636731, ID: 1300
Training loss per 100 training steps: 0.276881389566521

In [34]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval();
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            someoutput = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = someoutput[0], someoutput[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [35]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.04244834929704666
Validation loss per 100 evaluation steps: 0.04803369089718712
Validation loss per 100 evaluation steps: 0.042919962155403606
Validation loss per 100 evaluation steps: 0.04803119140858545
Validation loss per 100 evaluation steps: 0.05132356928434296
Validation loss per 100 evaluation steps: 0.054814144338526966
Validation loss per 100 evaluation steps: 0.05314136689419671
Validation loss per 100 evaluation steps: 0.051927018797485056
Validation loss per 100 evaluation steps: 0.052388068023410075
Validation loss per 100 evaluation steps: 0.053658724621244006
Validation loss per 100 evaluation steps: 0.05165153940899407
Validation loss per 100 evaluation steps: 0.05078425480106596
Validation loss per 100 evaluation steps: 0.05241333129485219
Validation loss per 100 evaluation steps: 0.053097474464233414
Validation loss per 100 evaluation steps: 0.05430578602204621
Validation loss per 100 evaluation steps: 0.05479619081622739
Va

In [41]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         CUR       1.00      1.00      1.00       778
         DAT       0.90      0.92      0.91        79
         DRG       0.89      0.90      0.90      1905
         LOC       0.96      0.96      0.96      2210
        MISC       0.89      0.89      0.89      1069
         ORG       0.91      0.94      0.92      1966
         PER       0.97      0.97      0.97      2078
         WEP       0.89      0.94      0.92       632

   micro avg       0.93      0.94      0.94     10717
   macro avg       0.93      0.94      0.93     10717
weighted avg       0.93      0.94      0.94     10717



In [37]:
# ! pip install seqeval==0.0.12
# ! pip install unidecode

In [43]:
sentence = "I have been working in Spain for more than 10 years and I met Sara and Khaled, I love them. Also I was in the Guardia Civil office with Enrique"


inputs = tokenizer(sentence.split(),
                    is_split_into_words=True, 
                    return_offsets_mapping=True, 
                    padding='max_length', 
                    truncation=True, 
                    max_length=MAX_LEN,
                    return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device, dtype = torch.long)
mask = inputs["attention_mask"].to(device, dtype = torch.long)

# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

# print(sentence.split())
# print(prediction)
prd  = list(zip(sentence.split(), prediction))
prd

[('I', 'O'),
 ('have', 'O'),
 ('been', 'O'),
 ('working', 'O'),
 ('in', 'O'),
 ('Spain', 'B-LOC'),
 ('for', 'O'),
 ('more', 'O'),
 ('than', 'O'),
 ('10', 'O'),
 ('years', 'O'),
 ('and', 'O'),
 ('I', 'O'),
 ('met', 'O'),
 ('Sara', 'B-PER'),
 ('and', 'O'),
 ('Khaled,', 'B-PER'),
 ('I', 'O'),
 ('love', 'O'),
 ('them.', 'O'),
 ('Also', 'O'),
 ('I', 'O'),
 ('was', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Guardia', 'B-ORG'),
 ('Civil', 'I-ORG'),
 ('office', 'O'),
 ('with', 'O'),
 ('Enrique', 'B-PER')]

In [39]:
# import os

# directory = "./bert-base-uncased-3epk"

# if not os.path.exists(directory):
#     os.makedirs(directory)

# # save vocabulary of the tokenizer
# tokenizer.save_vocabulary(directory)
# # save the model weights and its configuration file
# model.save_pretrained(directory)
# print('All files saved')

In [40]:
MAX_LEN

256