In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
data = pd.read_csv('../../Resources/data/ner/entity-annotated-corpus/ner_dataset.csv',
                   encoding='latin1').fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        
        agg_func = lambda x: [(w, p, t) for w, p, t in zip(x['Word'].values.tolist(),
                                                           x['POS'].values.tolist(),
                                                           x['Tag'].values.tolist())]
        
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try:
            s = self.grouped['Sentece: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [4]:
sg = SentenceGetter(data)

In [5]:
sentences = [[word[0] for word in sentence] for sentence in sg.sentences]
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [6]:
labels = [[l[2] for l in sentence] for sentence in sg.sentences]
labels[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O']

In [7]:
tag_values = list(data['Tag'].unique())
tag_values.append('PAD')
tag_values

['O',
 'B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat',
 'PAD']

In [8]:
tag2idx = {t: i for i, t in enumerate(tag_values)}
tag2idx

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16,
 'PAD': 17}

In [9]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler

import transformers

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

Using TensorFlow backend.


'1.2.0'

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce 940MX'

In [11]:
MAX_LEN = 32
batch_size = 8

In [12]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x163d1eca408>

In [13]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)

        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [14]:
tokenized_sentece_labels = [
    tokenize_and_preserve_labels(sent, label)
    for sent, label in zip(sentences, labels)
]

In [15]:
tokenized_texts = [tok_label_pair[0] for tok_label_pair in tokenized_sentece_labels]
tokenized_labels = [tok_label_pair[1] for tok_label_pair in tokenized_sentece_labels]

In [16]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [17]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [18]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [19]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [20]:
tr_inputs = torch.tensor(tr_inputs).to(torch.int64)
val_inputs = torch.tensor(val_inputs).to(torch.int64)
tr_tags = torch.tensor(tr_tags).to(torch.int64)
val_tags = torch.tensor(val_tags).to(torch.int64)
tr_masks = torch.tensor(tr_masks).to(torch.int64)
val_masks = torch.tensor(val_masks).to(torch.int64)

In [21]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [22]:
model = transformers.BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False,
)

In [23]:
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [24]:
param_optimizer = list(model.classifier.named_parameters())
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

In [25]:
optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

In [26]:
from transformers import WarmupLinearSchedule as get_linear_schedule_with_warmup

In [27]:
epochs = 1
max_grad_norm = 1.0
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            warmup_steps=0,
                                            t_total=total_steps)

In [28]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import f1_score
from tqdm import trange

In [29]:
loss_values, val_loss_values = [], []

for _ in trange(epochs, desc='Epoch'):
    
    ## ========================
    
    ##      Training
    
    ## ========================
    
    ## Put the model into training mode
    model.train()
    
    ## Resetting loss for this epoch
    total_loss = 0
    
    ## Training loop
    for step, batch in enumerate(train_dataloader):
        
        ## Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        ## Clearing previously calculated gradients
        model.zero_grad()
        
        ## Forward pass
        ## This will return loss, rather than the model output
        ## Since we have passed the labels
        
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        
        ## Get loss
        loss = outputs[0]
        
        ## Back-prop
        loss.backward()
        
        ## Track train loss
        total_loss += loss.item()
        
        ## Clipping norm gradient to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                       max_norm=max_grad_norm)
        
        ## Update parameters
        optimizer.step()
        
        ## Update LR
        scheduler.step()
        
    
    ## Calculate avg. train loss over training data
    avg_train_loss = total_loss / len(train_dataloader)
    print('* Avg. Train Loss: ', round(avg_train_loss, 4))
    
    ## Store loss to draw curves
    loss_values.append(avg_train_loss)
    
    
    ## =====================
    
    ##      Validation
    
    ## =====================
    
    
    ## Put model into eval mode
    model.eval()
    
    ## Reset validation loss for this epoch
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        ## Don't store/compute any gradients
        ## Saving memory, speed-up processing
        with torch.no_grad():
            ## Forward pass, calculate logit preds
            ## Will return logits rather than loss, since we won't provide labels
            
            outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
            
        ## Move logits, labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        ## Calculate acc for this batch
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        
        
    eval_loss = eval_loss / len(valid_dataloader)
    val_loss_values.append(eval_loss)
    print('* Validation Loss: ', round(eval_loss, 4))
    
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    
    print('* Validation Acc: ', accuracy_score(pred_tags, valid_tags))
    print('* Validation F1-Score: ', f1_score(pred_tags, valid_tags))
    print()

Epoch:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]

* Avg. Train Loss:  1.0175
* Validation Loss:  0.8483
* Validation Acc:  0.8382741244402317
* Validation F1-Score:  0.00016864828400371025



Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 1/1 [2:07:54<00:00, 7674.99s/it]


In [30]:
# import matplotlib.pyplot as plt
# %matplotlib inline

# import seaborn as sns


# ##  Increase the plot size and font size.
# sns.set(font_scale=1.5)
# plt.rcParams["figure.figsize"] = (12, 6)

# ## Plot the learning curve.
# plt.plot(loss_values, 'b-o', label="training loss")
# plt.plot(validation_loss_values, 'r-o', label="validation loss")

# ## Label the plot.
# plt.title("Learning curve")
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.legend()

# plt.tight_layout()

In [None]:
sentence = "The London Bridge has fallen"
tokenized_sentence = tokenizer.encode(sentence)
input_ids = torch.tensor([tokenized_sentence]).to(torch.int64).cuda()

with torch.no_grad():
    output = model(input_ids)
    
label_idx = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [None]:
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []

for token, label_id in zip(tokens, label_idx[0]):
    if token.startswith('##'):
        new_token[-1] = new_token[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_id])
        new_tokens.append(token)
        
for token, label in zip(new_tokens, new_labels):
    print('{} \t {}'.format(label, token))