In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from ast import literal_eval
from tqdm import tqdm
import os

# specify GPU
device = torch.device("cuda")

In [2]:
DATA_PATH = '..'

config = {
    'MAX_LEN': 512,
    'batch_size': 5,
    'model_name': 'kaggle_bert_base_cased'
}

In [3]:
df = pd.read_csv(f'{DATA_PATH}/datasets/ner.csv')
df.POS = df.POS.apply(literal_eval)
df.Tag = df.Tag.apply(literal_eval)

In [4]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [5]:
tag_col = df['Tag'].tolist()

all_tags = sorted(list({t for tags in tag_col for t in tags}))

tag2idx = {}
for idx, entity in enumerate(all_tags):
    tag2idx[f'{entity}'] = idx
    
tag2idx['PAD'] = len(tag2idx)

tag2idx

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16,
 'PAD': 17}

In [6]:
df_sample = df[['Sentence', 'Tag']].sample(int(df.shape[0]*0.5)).reset_index(drop=True)

In [7]:
df_sample.shape

(23979, 2)

In [8]:
df_sample.head(1)

Unnamed: 0,Sentence,Tag
0,"Wednesday , three closely coordinated car bomb...","[B-tim, O, O, O, O, O, O, O, O, O, O, B-geo, O]"


In [9]:
del df

In [10]:
train_df, test_df = train_test_split(df_sample, test_size=0.25, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [11]:
train_df.shape, test_df.shape

((17984, 2), (5995, 2))

In [12]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [13]:
def tokenize_and_align_labels(sentence, tags, tokenizer):
    tokenized = tokenizer(sentence.split(' '), truncation=True, 
                          is_split_into_words=True)

    #align tokens and labels
    labels = []
    word_ids = tokenized.word_ids()
    prev_word_id = None
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        elif word_id != prev_word_id:
            labels.append(tag2idx[tags[word_id]])    
        else:
            labels.append(-100)
        prev_word_id = word_id

    tokenized["labels"] = labels
    return tokenized

In [14]:
def data_to_list(df):
    id_list = df.index.values.tolist()
    sentence_list = df.Sentence.values.tolist()
    tag_list = df.Tag.values.tolist()
    
    return id_list, sentence_list, tag_list

train_id_list, train_sentence_list, train_tag_list = data_to_list(train_df)
valid_id_list, valid_sentence_list, valid_tag_list = data_to_list(test_df)

In [15]:
class NerDataset(Dataset):
    def __init__(self, ner_data, ner_tags, tokenizer, tag_to_idx, 
                 data_type='test', **kwarg):
        self.ner_data = ner_data
        self.ner_tags = ner_tags
        self.tokenizer = tokenizer
        self.tag_to_idx = tag_to_idx
        self.max_lenght = config['MAX_LEN']
        self.data_type = data_type
        
    def __len__(self):
        return len(self.ner_data)

    def __getitem__(self, item):
        tokenized = tokenize_and_align_labels(self.ner_data[item], self.ner_tags[item], tokenizer)
        
        input_ids = tokenized['input_ids']
        attention_mask = tokenized['attention_mask']
        labels = tokenized['labels']

        #pad seq
        pad_len = self.max_lenght - len(input_ids)
        input_ids = input_ids + [2] * pad_len
        attention_mask = attention_mask + [0] * pad_len

        if self.data_type != 'test':
            labels = labels + [2] * pad_len
        else:
            labels = 1

        return {'input_ids': torch.tensor(input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
                'target': torch.tensor(labels, dtype=torch.long)
               }

In [16]:
train_dataset = NerDataset(ner_data=train_sentence_list, 
                           ner_tags=train_tag_list, 
                           tokenizer=tokenizer, 
                           tag_to_idx=tag2idx, 
                           data_type='train')

val_dataset = NerDataset(ner_data=valid_sentence_list,
                         ner_tags=valid_tag_list,
                         tokenizer=tokenizer,
                         tag_to_idx=tag2idx,
                         data_type='valid')

In [17]:
print(train_sentence_list[0])
print(train_tag_list[0])
train_dataset.__getitem__(0)

Health Minister Jose Ramon Balaguer told reporters in Havana Friday that Moore 's movie , screened recently at the Cannes film festival , will show the world the humaneness of the Cuban health care system .
['O', 'O', 'B-per', 'I-per', 'I-per', 'O', 'O', 'O', 'B-geo', 'B-tim', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O']


{'input_ids': tensor([  101,  3225,  2110,  7438, 19437, 18757, 18974, 10232,  1500, 13509,
          1107, 16092,  5286,  1115,  4673,   112,   188,  2523,   117, 12468,
          3055,  1120,  1103, 15451,  1273,  3782,   117,  1209,  1437,  1103,
          1362,  1103,  1769,  7582,  3954,  1104,  1103,  9383,  2332,  1920,
          1449,   119,   102,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,   

In [18]:
train_data_loader = DataLoader(train_dataset,
                               batch_size=config['batch_size'],
                               shuffle=False)

val_data_loader = DataLoader(val_dataset,
                             batch_size=config['batch_size'],
                             shuffle=False)

<h1>Train model

In [19]:
def train(data_loader, model, optimizer):
    train_loss = 0
#     for idx, dataset in enumerate(tqdm(data_loader, total=len(data_loader))):
    for idx, dataset in enumerate(tqdm(data_loader, total=len(data_loader))):
        batch_input_ids = dataset['input_ids'].to(device, dtype=torch.long)
        batch_att_mask = dataset['attention_mask'].to(device, dtype=torch.long)
#         batch_token_type_id = dataset['token_id'].to(device, dtype=torch.long)
        batch_target = dataset['target'].to(device, dtype=torch.long)
        
        output = model(batch_input_ids,
                       token_type_ids=None,
                       attention_mask=batch_att_mask,
                       labels=batch_target)
        
        step_loss = output[0]
        prediction = output[1]
        
        step_loss.sum().backward()
        optimizer.step()
        train_loss += step_loss
        optimizer.zero_grad()
        
    return train_loss.sum()
        

In [20]:
def evaluate(data_loader, model):
    model.eval()
    
    eval_loss = 0
    predictions = np.array([], dtype=np.int64).reshape(0, config['MAX_LEN'])
    true_labels = np.array([], dtype=np.int64).reshape(0, config['MAX_LEN'])
    
    with torch.no_grad():
        for idx, dataset in enumerate(tqdm(data_loader, total=len(data_loader))):
            batch_input_ids = dataset['input_ids'].to(device, dtype=torch.long)
            batch_att_mask = dataset['attention_mask'].to(device, dtype=torch.long)
            batch_target = dataset['target'].to(device, dtype=torch.long)
            
            output = model(batch_input_ids,
                           token_type_ids=None,
                           attention_mask=batch_att_mask,
                           labels=batch_target)
            
            step_loss = output[0]
            eval_prediction = output[1]
            
            eval_loss += step_loss
            
            eval_prediction = np.argmax(eval_prediction.detach().to('cpu').numpy(), 
                                        axis=2)
            actual = batch_target.to('cpu').numpy()
            
            predictions = np.concatenate((predictions, eval_prediction), axis=0)
            true_labels = np.concatenate((true_labels, actual), axis=0)
            
            return eval_loss.sum(), predictions, true_labels

In [21]:
#tag2idx without PAD token
model = transformers.BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(tag2idx)-1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
if not os.path.exists('../bert_output'):
    os.mkdir('../bert_output')
    
epoch = 3
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

best_eval_loss = 1000000
for i in range(epoch):
    train_loss = train(data_loader=train_data_loader,
                       model=model,
                       optimizer=optimizer)
    eval_loss, eval_predictions, true_labels = evaluate(data_loader=val_data_loader,
                                                        model=model)
    
    print(f"Epoch {i} , Train loss: {train_loss}, Eval loss: {eval_loss}")

    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss           

        print("Saving the model")
        torch.save(model.state_dict(), f'../bert_output/{config["model_name"]}')

  6%|████▉                                                                          | 223/3597 [01:10<17:37,  3.19it/s]