In [1]:
# https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=Eh3ckSO0YMZW

In [2]:
# ! pip install pandas 
# ! pip install scikit-learn
# ! pip install seqeval==0.0.12
# ! pip install unidecode

In [3]:
# Load dataset
import pandas as pd 

df = pd.read_csv('./NER_DS_SPANISH/spanish_ner_dataset_INCIBE.csv', encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,sentence_idx,word,tag
0,0,0,Juan,B-PERSON
1,1,0,vive,O
2,2,0,en,O
3,3,0,España,B-GPE-LOCATION
4,4,0,.,O


In [4]:
len(set(df.sentence_idx.tolist()))

48791

In [5]:
# Split dataset:
split_thresh = df['sentence_idx'].max() * 0.9
df_train, df_valid = df[df['sentence_idx'] < split_thresh], df[df['sentence_idx'] >= split_thresh]
len(df_train), len(df_valid)

(353854, 45707)

In [6]:
agg_func = lambda s: [ [w,t] for w,t in zip(s["word"].values.tolist(),s["tag"].values.tolist())]

In [7]:
x_train_grouped = df_train.groupby("sentence_idx").apply(agg_func)
x_valid_grouped = df_valid.groupby("sentence_idx").apply(agg_func)

x_train_sentences = [[s[0] for s in sent] for sent in x_train_grouped.values]
x_valid_sentences = [[s[0] for s in sent] for sent in x_valid_grouped.values]

x_train_tags = [[t[1] for t in tag] for tag in x_train_grouped.values]
x_valid_tags = [[t[1] for t in tag] for tag in x_valid_grouped.values]

In [8]:
tag_list = df['tag'].unique()
label_map = {label: i for i, label in enumerate(tag_list)}
label_map_inv = {i: label for i, label in enumerate(tag_list)}
num_labels = len(tag_list) + 1
num_labels

38

In [9]:
print("Number of tags: {}".format(len(df.tag.unique())))
frequencies = df.tag.value_counts()
frequencies

Number of tags: 37


O                 299007
I-PRODUCTS         35923
B-PRODUCTS         16242
I-ORGANIZATION      7190
B-NUMBERS_C         6936
I-NUMBERS_C         5874
B-ORGANIZATION      5636
I-GPE-LOCATION      4772
B-GPE-LOCATION      3039
I-DATE              2550
B-DATE              1619
I-QUANTITIES        1073
I-FACILITIES        1062
B-QUANTITIES         969
I-PERSON             938
I-ART                932
B-PERSON             856
I-EVENTS             658
I-LAW                655
I-TIME               540
I-MONEY              500
B-TIME               477
B-FACILITIES         374
B-ART                295
B-MONEY              286
I-LOCATION           219
B-EVENTS             216
B-LOCATION           177
B-LANGUAGE           165
B-NORP               164
B-LAW                103
B-NUMBERS_O           49
I-LANGUAGE            24
I-GROUP               19
I-NORP                10
B-GROUP                8
I-NUMBERS_O            4
Name: tag, dtype: int64

In [10]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:] not in tags.keys():
            tags[tag[2:]] = count
        else:
            tags[tag[2:]] += count
    continue

sorted(tags.items(), key=lambda x: x[1], reverse=True)

[('PRODUCTS', 52165),
 ('ORGANIZATION', 12826),
 ('NUMBERS_C', 12810),
 ('GPE-LOCATION', 7811),
 ('DATE', 4169),
 ('QUANTITIES', 2042),
 ('PERSON', 1794),
 ('FACILITIES', 1436),
 ('ART', 1227),
 ('TIME', 1017),
 ('EVENTS', 874),
 ('MONEY', 786),
 ('LAW', 758),
 ('LOCATION', 396),
 ('LANGUAGE', 189),
 ('NORP', 174),
 ('NUMBERS_O', 53),
 ('GROUP', 27)]

In [11]:
entities_to_remove = ["B-TIME", "I-TIME",
                      "B-NUMBERS_O", "I-NUMBERS_O",
                      "B-NORP", "I-NORP",
                      "B-LAW", "I-LAW",
                      "B-NUMBERS_C", "I-NUMBERS_C",
                      "B-DATE", "I-DATE",
                      "B-ART", "I-ART", 
                     ]
df = df[~df.tag.isin(entities_to_remove)]
df.head()

Unnamed: 0.1,Unnamed: 0,sentence_idx,word,tag
0,0,0,Juan,B-PERSON
1,1,0,vive,O
2,2,0,en,O
3,3,0,España,B-GPE-LOCATION
4,4,0,.,O


In [12]:
print("Number of tags: {}".format(len(df.tag.unique())))
frequencies = df.tag.value_counts()
frequencies

Number of tags: 23


O                 299007
I-PRODUCTS         35923
B-PRODUCTS         16242
I-ORGANIZATION      7190
B-ORGANIZATION      5636
I-GPE-LOCATION      4772
B-GPE-LOCATION      3039
I-QUANTITIES        1073
I-FACILITIES        1062
B-QUANTITIES         969
I-PERSON             938
B-PERSON             856
I-EVENTS             658
I-MONEY              500
B-FACILITIES         374
B-MONEY              286
I-LOCATION           219
B-EVENTS             216
B-LOCATION           177
B-LANGUAGE           165
I-LANGUAGE            24
I-GROUP               19
B-GROUP                8
Name: tag, dtype: int64

In [13]:
tag_list = df['tag'].unique()
label_map = {label: i for i, label in enumerate(tag_list)}
label_map_inv = {i: label for i, label in enumerate(tag_list)}
num_labels = len(tag_list) + 1
(num_labels/2)-1

11.0

In [14]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:] not in tags.keys():
            tags[tag[2:]] = count
        else:
            tags[tag[2:]] += count
    continue

sorted(tags.items(), key=lambda x: x[1], reverse=True)

[('PRODUCTS', 52165),
 ('ORGANIZATION', 12826),
 ('GPE-LOCATION', 7811),
 ('QUANTITIES', 2042),
 ('PERSON', 1794),
 ('FACILITIES', 1436),
 ('EVENTS', 874),
 ('MONEY', 786),
 ('LOCATION', 396),
 ('LANGUAGE', 189),
 ('GROUP', 27)]

In [29]:
sum(tags.values())

80346

In [16]:
labels_to_ids = {k: v for v, k in enumerate(df.tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(df.tag.unique())}
labels_to_ids

{'B-PERSON': 0,
 'O': 1,
 'B-GPE-LOCATION': 2,
 'I-PERSON': 3,
 'B-ORGANIZATION': 4,
 'I-ORGANIZATION': 5,
 'B-MONEY': 6,
 'I-GPE-LOCATION': 7,
 'B-LANGUAGE': 8,
 'B-PRODUCTS': 9,
 'I-PRODUCTS': 10,
 'B-FACILITIES': 11,
 'I-FACILITIES': 12,
 'B-EVENTS': 13,
 'I-EVENTS': 14,
 'B-GROUP': 15,
 'I-GROUP': 16,
 'I-MONEY': 17,
 'B-QUANTITIES': 18,
 'I-QUANTITIES': 19,
 'B-LOCATION': 20,
 'I-LOCATION': 21,
 'I-LANGUAGE': 22}

In [17]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
df = df.fillna(method='ffill')
df.head()

Unnamed: 0.1,Unnamed: 0,sentence_idx,word,tag
0,0,0,Juan,B-PERSON
1,1,0,vive,O
2,2,0,en,O
3,3,0,España,B-GPE-LOCATION
4,4,0,.,O


In [18]:
# let's create a new column called "sentence" which groups the words by sentence 
df['sentence'] = df[['sentence_idx','word','tag']].groupby(['sentence_idx'])['word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
df['word_labels'] = df[['sentence_idx','word','tag']].groupby(['sentence_idx'])['tag'].transform(lambda x: ','.join(x))
df.head()

Unnamed: 0.1,Unnamed: 0,sentence_idx,word,tag,sentence,word_labels
0,0,0,Juan,B-PERSON,Juan vive en España .,"B-PERSON,O,O,B-GPE-LOCATION,O"
1,1,0,vive,O,Juan vive en España .,"B-PERSON,O,O,B-GPE-LOCATION,O"
2,2,0,en,O,Juan vive en España .,"B-PERSON,O,O,B-GPE-LOCATION,O"
3,3,0,España,B-GPE-LOCATION,Juan vive en España .,"B-PERSON,O,O,B-GPE-LOCATION,O"
4,4,0,.,O,Juan vive en España .,"B-PERSON,O,O,B-GPE-LOCATION,O"


In [19]:
df = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,sentence,word_labels
0,Juan vive en España .,"B-PERSON,O,O,B-GPE-LOCATION,O"
1,Prueba de etiquetado,"O,O,O"
2,"Pedro Sánchez Pérez - Castejón , presidente de...","B-PERSON,I-PERSON,I-PERSON,I-PERSON,I-PERSON,O..."
3,fotocenter.es – Imprimir tus mejores recuerdos...,"B-ORGANIZATION,O,O,O,O,O,O,O,O,O"
4,Skip to content,"O,O,O"


In [20]:
# Remove hidden split chat
df['sentence'] = df['sentence'].str.replace('\\xa0','', regex=True)

In [21]:
# df['sentence'] = df['sentence'].str.lower()

In [22]:
len(df)

33544

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [24]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 2
EPOCHS = 2
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained("dccuchile/bert-base-spanish-wwm-cased",
                                              do_lower_case=False)# bert-base-multilingual-uncased", do_lower_case=True)

In [25]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentence[index].strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [26]:
train_size = 0.8
train_dataset = df.sample(frac=train_size,random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (33544, 2)
TRAIN Dataset: (26835, 2)
TEST Dataset: (6709, 2)


In [33]:
with open('./NER_DS_SPANISH/test_set_sent.txt','w', encoding='utf-8') as wrt:
    wrt.write('\n'.join(test_dataset.sentence.tolist()))


In [28]:
# train_dataset.iloc[13518].word_labels

In [30]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[80]["input_ids"]), training_set[80]["labels"]):
    # print('{0:10}  {1}'.format(token, label))
    pass

In [31]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [32]:
from torch import cuda
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification


device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [34]:
# model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
# model.to(device)

from transformers import AutoModel, AutoTokenizer 

model = BertForTokenClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased",# "bert-base-multilingual-uncased", 
                                                   num_labels=len(labels_to_ids), 
                                                   ignore_mismatched_sizes=True)
model.to(device);

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base

In [35]:
inputs = training_set[0]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)
# labels = labels.type(torch.LongTensor)

input_ids = input_ids.to(device, dtype = torch.long)
attention_mask = attention_mask.to(device, dtype = torch.long)
labels = labels.to(device, dtype = torch.long)


In [36]:
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.9567, device='cuda:0', grad_fn=<NllLossBackward0>)

In [37]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 256, 23])

In [38]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [39]:
import os

def save_model(tk, ml, epk):    

    directory = "./beto_es_cased"
    
    p  = os.path.join(directory,f"epoch_{str(epk)}")
    os.makedirs(p, exist_ok=True)
    
    # save vocabulary of the tokenizer
    tk.save_vocabulary(p)
    
    # save the model weights and its configuration file
    ml.save_pretrained(p)
    print('All files saved')


In [41]:
# Defining the training function on the 80% of the dataset for tuning the bert model
# def train(epoch):

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
        
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train();

    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)
#         labels = labels.type(torch.LongTensor).to(device)

        someoutput = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = someoutput[0], someoutput[1]        
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}, ID: {idx}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    save_model(tk=tokenizer, ml = model, epk=epoch)

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    
    

Training epoch: 1
Training loss per 100 training steps: 3.061131477355957, ID: 0
Training loss per 100 training steps: 1.1223724056588542, ID: 100
Training loss per 100 training steps: 0.8686017387393695, ID: 200
Training loss per 100 training steps: 0.749726538469031, ID: 300
Training loss per 100 training steps: 0.6695765233656712, ID: 400
Training loss per 100 training steps: 0.6147739472287798, ID: 500
Training loss per 100 training steps: 0.5736479909250671, ID: 600
Training loss per 100 training steps: 0.5342271324863574, ID: 700
Training loss per 100 training steps: 0.513883754387777, ID: 800
Training loss per 100 training steps: 0.4938842484326444, ID: 900
Training loss per 100 training steps: 0.47428736023020585, ID: 1000
Training loss per 100 training steps: 0.45876516616707846, ID: 1100
Training loss per 100 training steps: 0.44713820607093574, ID: 1200
Training loss per 100 training steps: 0.43483819077500097, ID: 1300
Training loss per 100 training steps: 0.423912421511692

In [47]:
for idx, batch in enumerate(training_loader):
    ids = batch['input_ids'].to(device, dtype = torch.long)
    mask = batch['attention_mask'].to(device, dtype = torch.long)
    labels = batch['labels'].to(device, dtype = torch.long)
    #         labels = labels.type(torch.LongTensor).to(device)

    someoutput = model(input_ids=ids, attention_mask=mask, labels=labels)
    loss, tr_logits = someoutput[0], someoutput[1]        
    tr_loss += loss.item()

    nb_tr_steps += 1
    nb_tr_examples += labels.size(0)

    if idx % 100==0:
        loss_step = tr_loss/nb_tr_steps
        print(f"Training loss per 100 training steps: {loss_step}, ID: {idx}")

    # compute training accuracy
    flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
    active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

    # only compute accuracy at active labels
    active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
    #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

    labels = torch.masked_select(flattened_targets, active_accuracy)
    predictions = torch.masked_select(flattened_predictions, active_accuracy)

    tr_labels.extend(labels)
    tr_preds.extend(predictions)

    tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
    tr_accuracy += tmp_tr_accuracy

    # gradient clipping
    torch.nn.utils.clip_grad_norm_(
        parameters=model.parameters(), max_norm=MAX_GRAD_NORM
    )

    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
save_model(tk=tokenizer, ml = model, epk=3)

Training loss per 100 training steps: 0.18786422765764954, ID: 0
Training loss per 100 training steps: 0.186547505759654, ID: 100
Training loss per 100 training steps: 0.18549549811076793, ID: 200
Training loss per 100 training steps: 0.18364159035215846, ID: 300
Training loss per 100 training steps: 0.1825051229602512, ID: 400
Training loss per 100 training steps: 0.18117876747319403, ID: 500
Training loss per 100 training steps: 0.1800638874887725, ID: 600
Training loss per 100 training steps: 0.17913265686496305, ID: 700
Training loss per 100 training steps: 0.1778258933199844, ID: 800
Training loss per 100 training steps: 0.17685511325283795, ID: 900
Training loss per 100 training steps: 0.1757336467892481, ID: 1000
Training loss per 100 training steps: 0.17470867696876052, ID: 1100
Training loss per 100 training steps: 0.17390276988994408, ID: 1200
Training loss per 100 training steps: 0.17301077696377992, ID: 1300
Training loss per 100 training steps: 0.17167796911097266, ID: 140

In [50]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval();
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            someoutput = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = someoutput[0], someoutput[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [51]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.02338421903550625
Validation loss per 100 evaluation steps: 0.2665409466764938
Validation loss per 100 evaluation steps: 0.2996994985880148
Validation loss per 100 evaluation steps: 0.31488958498145114
Validation loss per 100 evaluation steps: 0.30534930842965025
Validation loss per 100 evaluation steps: 0.29187828005164185
Validation loss per 100 evaluation steps: 0.28675054938511785
Validation loss per 100 evaluation steps: 0.2836471598551746
Validation loss per 100 evaluation steps: 0.28850552096513327
Validation loss per 100 evaluation steps: 0.29297508056138327
Validation loss per 100 evaluation steps: 0.28448893748907095
Validation loss per 100 evaluation steps: 0.2791695355682581
Validation loss per 100 evaluation steps: 0.27426035704834806
Validation loss per 100 evaluation steps: 0.27162374630618286
Validation loss per 100 evaluation steps: 0.2660222150060522
Validation loss per 100 evaluation steps: 0.26129283363122763
Validation lo

In [52]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      EVENTS       0.36      0.23      0.28        39
  FACILITIES       0.46      0.44      0.45        66
GPE-LOCATION       0.75      0.76      0.75       550
       GROUP       0.00      0.00      0.00         4
    LANGUAGE       0.56      0.62      0.59        24
    LOCATION       0.50      0.46      0.48        37
       MONEY       0.80      0.80      0.80        45
ORGANIZATION       0.57      0.74      0.64       961
      PERSON       0.77      0.79      0.78       171
    PRODUCTS       0.73      0.73      0.73      2653
  QUANTITIES       0.77      0.83      0.80       167

   micro avg       0.69      0.73      0.71      4717
   macro avg       0.57      0.58      0.57      4717
weighted avg       0.70      0.73      0.71      4717



In [59]:
import nltk 

text = "Mañana iré a la empresa Alesida a ver la posibilidad de comprar un piso grande con ventanales gigantes. También puedes comprar un iPhone modelo año 2022"
text = 'Me llamo Mar Jose. Vivo cerca del Guardia Civil, que es uno de los lugares más famosos de España. ' \
       'Ahi se venden grabanzo, barras de chocolate y Coca-Cola. La gente habla principalmente español y francés. ' \
       '5 euros ¡Asistiremos al concierto de Wesam 2023 que sería increíble!. ' \
       'Vamos a quedarnos a las 18:00 de la noche!'

inputs = tokenizer(nltk.word_tokenize(text),
                    is_split_into_words=True, 
                    return_offsets_mapping=True, 
                    padding='max_length', 
                    truncation=True, 
                    max_length=256,
                    return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device, dtype = torch.long)
mask = inputs["attention_mask"].to(device, dtype = torch.long)

# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

# print(sentence.split())
# print(prediction)
prd  = list(zip(nltk.word_tokenize(text), prediction))
prd

[('Me', 'O'),
 ('llamo', 'O'),
 ('Mar', 'B-PERSON'),
 ('Jose', 'I-PERSON'),
 ('.', 'O'),
 ('Vivo', 'O'),
 ('cerca', 'O'),
 ('del', 'O'),
 ('Guardia', 'B-FACILITIES'),
 ('Civil', 'I-ORGANIZATION'),
 (',', 'O'),
 ('que', 'O'),
 ('es', 'O'),
 ('uno', 'O'),
 ('de', 'O'),
 ('los', 'O'),
 ('lugares', 'O'),
 ('más', 'O'),
 ('famosos', 'O'),
 ('de', 'O'),
 ('España', 'B-GPE-LOCATION'),
 ('.', 'O'),
 ('Ahi', 'O'),
 ('se', 'O'),
 ('venden', 'O'),
 ('grabanzo', 'B-PRODUCTS'),
 (',', 'O'),
 ('barras', 'B-PRODUCTS'),
 ('de', 'I-PRODUCTS'),
 ('chocolate', 'I-PRODUCTS'),
 ('y', 'O'),
 ('Coca-Cola', 'B-PRODUCTS'),
 ('.', 'O'),
 ('La', 'O'),
 ('gente', 'O'),
 ('habla', 'O'),
 ('principalmente', 'O'),
 ('español', 'B-LANGUAGE'),
 ('y', 'O'),
 ('francés', 'B-LANGUAGE'),
 ('.', 'O'),
 ('5', 'B-MONEY'),
 ('euros', 'I-MONEY'),
 ('¡Asistiremos', 'O'),
 ('al', 'O'),
 ('concierto', 'O'),
 ('de', 'O'),
 ('Wesam', 'B-EVENTS'),
 ('2023', 'I-EVENTS'),
 ('que', 'O'),
 ('sería', 'O'),
 ('increíble', 'O'),
 ('!', 'O'

In [60]:
save_model(tk=tokenizer, ml=model, epk='final')

All files saved
