### The NER data formatting process is in kaggle's kernel:
### https://www.kaggle.com/xiaonanji/coleridge-initiative/edit

The kaggle kernel has limited memory to train bert so I have to do it locally

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import re
import random

torch.__version__, torch.cuda.is_available(), transformers.__version__

('1.7.1+cu110', True, '4.2.2')

In [2]:
import subprocess as sp
import os

def get_gpu_memory():
    _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]

    ACCEPTABLE_AVAILABLE_MEMORY = 1024
    COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = _output_to_list(sp.check_output(COMMAND.split()))[1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    print(memory_free_values)
    return memory_free_values

get_gpu_memory()

[8031]


[8031]

In [3]:
training_data = pd.read_csv("C:\\Users\\stick\\kaggle\\ner_training.csv",sep=",",encoding="utf8", header=None, keep_default_na=False, na_values=[], names=['Sentence', 'Word', 'Tag'])
training_data.shape, training_data

((21363823, 3),
           Sentence      Word Tag
 0         S2704290       The   O
 1         S2704290   Centers   O
 2         S2704290       for   O
 3         S2704290   Disease   O
 4         S2704290   Control   O
 ...            ...       ...  ..
 21363818  S1137486  datasets   O
 21363819  S1137486         [   O
 21363820  S1137486        15   O
 21363821  S1137486         ]   O
 21363822  S1137486         .   O
 
 [21363823 rows x 3 columns])

In [4]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(),
                                                     s['Tag'].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try:
            s = self.grouped['s{0}'.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [5]:
getter = SentenceGetter(training_data)

In [6]:
sentences = [[tu[0] for tu in sentence] for sentence in getter.sentences]
print(sentences[0])
labels = [[tu[1] for tu in sentence] for sentence in getter.sentences]
print(labels[0])

['This', 'study', 'used', 'data', 'from', 'the', 'National', 'Education', 'Longitudinal', 'Study', '(', 'NELS:88', ')', 'to', 'examine', 'the', 'effects', 'of', 'dual', 'enrollment', 'programs', 'for', 'high', 'school', 'students', 'on', 'college', 'degree', 'attainment', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-D', 'I-D', 'I-D', 'I-D', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [7]:
len(sentences), len(labels)

(434805, 434805)

In [9]:
tag_values = list(training_data.Tag.unique())
tag_values.append('PAD')
tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag2idx)
print(tag_values)

{'O': 0, 'B-D': 1, 'I-D': 2, 'PAD': 3}
['O', 'B-D', 'I-D', 'PAD']


In [10]:
MAX_LEN = 100
BATCH_SIZE = 32
PRETRAIN_MODEL = 'bert-base-cased'
VALIDATE_PERCENT = 0.2
TRUNCATING_TYPE = 'pre'
PADDING_TYPE = 'post'
epochs = 2
max_grad_norm = 1.0

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
print(device)
print(n_gpu)

cuda
1


In [12]:
tokenizer = BertTokenizer.from_pretrained(PRETRAIN_MODEL, do_lower_case=False)

In [13]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []
    
    for word, label in zip(sentence, text_labels):
        try:
            tokenized_word = tokenizer.tokenize(word)
            n_subwords = len(tokenized_word)
            tokenized_sentence.extend(tokenized_word)
            labels.extend([label]*n_subwords)
        except AttributeError:
            print(word)
            print(sentence)
            print(text_labels)
            
    return tokenized_sentence, labels

In [14]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)
]

In [15]:
len(tokenized_texts_and_labels)

434805

In [16]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [17]:
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], 
    maxlen=MAX_LEN, 
    dtype='long', 
    value=0.0, 
    truncating=TRUNCATING_TYPE, 
    padding=PADDING_TYPE
)
tags = pad_sequences(
    [[tag2idx.get(l) for l in lab] for lab in labels], 
    maxlen=MAX_LEN, 
    dtype='long',
    value=tag2idx["PAD"],
    truncating=TRUNCATING_TYPE,
    padding=PADDING_TYPE
)

In [188]:
tokenizer.convert_tokens_to_ids(['difficulties', 'difficult', 'If', 'if', 'AD', '##NI', 'RA', '##V', '##LT'])

[7866, 2846, 1409, 1191, 5844, 27451, 26547, 2559, 26909]

In [18]:
input_ids.shape

(434805, 100)

In [19]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [20]:
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(
    input_ids, tags, attention_masks, 
    random_state=2021,
    test_size=VALIDATE_PERCENT
)

In [21]:
tr_inputs.shape, tr_tags.shape, len(tr_masks)

((347844, 100), (347844, 100), 347844)

In [22]:
val_inputs.shape, val_tags.shape, len(val_masks)

((86961, 100), (86961, 100), 86961)

In [23]:
tr_inputs[3]

array([10605,  6824,  4001,  1132,  7160,  1114,  5021,  2942,   117,
        1423,  3107, 19755,  1132,  2602,  1107,  1843,  2448,   117,
       19755,  2766,  1104,   123,   118,  3527, 11080,  1132,  2533,
        1118,  4348,  2448,  1105, 19755,  2766,  1104,   124,   116,
        3527, 11080,  1132,  3597,  1114, 13552,  2942,   113,  1267,
        6177,   123,   119,   125,   157, 19366,  1204, 15577, 17580,
        4800,  1111, 14441,  1372, 14256,   114,   119,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

In [195]:
# tr_inputs = torch.tensor(tr_inputs)
# val_inputs = torch.tensor(val_inputs)
# tr_tags = torch.tensor(tr_tags)
# val_tags = torch.tensor(val_tags)
# tr_masks = torch.tensor(tr_masks)
# val_masks = torch.tensor(val_masks)

In [24]:
class SeanDataset:
    def __init__(self, word_tokens, masks, tags):
        self.word_tokens = word_tokens
        self.masks = masks
        self.tags = tags
        
    def __len__(self):
        return len(self.word_tokens)
    
    def __getitem__(self, item):
        word_tokens = self.word_tokens[item]
        masks = self.masks[item]
        tags = self.tags[item]
        return {
            "word_tokens": torch.tensor(word_tokens, dtype=torch.long),
            "masks": torch.tensor(masks, dtype=torch.float),
            "tags": torch.tensor(tags, dtype=torch.long)
        }

In [25]:
# train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)

train_dataset = SeanDataset(
    word_tokens=tr_inputs,
    masks=tr_masks,
    tags=tr_tags
)
    
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
)

# valid_data = TensorDataset(val_inputs, val_masks, val_tags)
# valid_sampler = SequentialSampler(valid_data)
# valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

valid_dataset = SeanDataset(
    word_tokens=val_inputs,
    masks=val_masks,
    tags=val_tags
)
    
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
)


In [26]:
len(train_dataloader)

10871

In [27]:
for d in train_dataloader:
    print(d)
    break

{'word_tokens': tensor([[ 1335,   139, 15928,  ...,     0,     0,     0],
        [ 1109,  2812,   118,  ...,     0,     0,     0],
        [ 1370,  1859,   117,  ...,     0,     0,     0],
        ...,
        [ 1188,  2860,  1110,  ...,     0,     0,     0],
        [  138,  8362, 12416,  ...,     0,     0,     0],
        [ 1130,  2943,   123,  ...,     0,     0,     0]]), 'masks': tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]]), 'tags': tensor([[0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3],
        ...,
        [0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3]])}


In [28]:
model = BertForTokenClassification.from_pretrained(
    PRETRAIN_MODEL,
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False
)
torch.cuda.empty_cache()
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [29]:
get_gpu_memory()

[7108]


[7108]

In [30]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [31]:
total_steps = len(train_dataloader) * epochs
print(total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

21742


In [32]:
from seqeval.metrics import f1_score, accuracy_score

In [33]:
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [34]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()

loss_values, validation_loss_values = [], []

for epoch_i in trange(epochs, desc='Epoch'):
    ########################################
    ## Training
    ########################################
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
#         batch = tuple(t.to(device) for t in batch)
#         b_input_ids, b_input_mask, b_labels = batch
        
#         b_input_ids = torch.tensor(b_input_ids).to(torch.int64)
#         b_input_mask = torch.tensor(b_input_mask).to(torch.int64)
#         b_labels = torch.tensor(b_labels).to(torch.int64)
        
        b_input_ids = batch['word_tokens'].to(device).to(torch.int64)
        b_input_mask = batch['masks'].to(device).to(torch.int64)
        b_labels = batch['tags'].to(device).to(torch.int64)
    
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print()
    print("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Training epoch took: {:}".format(training_time))
    
#     avg_train_loss = total_loss / len(train_dataloader)
#     print("Average train loss: {}".format(avg_train_loss))
    
    loss_values.append(avg_train_loss)
    
    ########################################
    ## Validate
    ########################################
    print()
    print("Running Validation...")
    t0 = time.time()
    
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
#         batch = tuple(t.to(device) for t in batch)
#         b_input_ids, b_input_mask, b_labels = batch
        
#         b_input_ids = torch.tensor(b_input_ids).to(torch.int64)
#         b_input_mask = torch.tensor(b_input_mask).to(torch.int64)
#         b_labels = torch.tensor(b_labels).to(torch.int64)

        b_input_ids = batch['word_tokens'].to(device).to(torch.int64)
        b_input_mask = batch['masks'].to(device).to(torch.int64)
        b_labels = batch['tags'].to(device).to(torch.int64)
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
    
    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    
    pred_tags = []
    for p, l in zip(predictions, true_labels):
        row = [tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != 'PAD']
        pred_tags.append(row)
    
    valid_tags = []
    for l in true_labels:
        row = [tag_values[l_i] for l_i in l if tag_values[l_i] != 'PAD']
        valid_tags.append(row)
    
    print('Validation Accuracy: {}'.format(accuracy_score(pred_tags, valid_tags)))
    print('Validation F1-Score: {}'.format(f1_score(pred_tags, valid_tags)))
    print()

print('')
print('Training complete!')

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]


Training...
Batch    40 of 10,871. Elapsed: 0:00:16.
Batch    80 of 10,871. Elapsed: 0:00:30.
Batch   120 of 10,871. Elapsed: 0:00:45.
Batch   160 of 10,871. Elapsed: 0:01:00.
Batch   200 of 10,871. Elapsed: 0:01:15.
Batch   240 of 10,871. Elapsed: 0:01:30.
Batch   280 of 10,871. Elapsed: 0:01:46.
Batch   320 of 10,871. Elapsed: 0:02:01.
Batch   360 of 10,871. Elapsed: 0:02:16.
Batch   400 of 10,871. Elapsed: 0:02:31.
Batch   440 of 10,871. Elapsed: 0:02:46.
Batch   480 of 10,871. Elapsed: 0:03:01.
Batch   520 of 10,871. Elapsed: 0:03:17.
Batch   560 of 10,871. Elapsed: 0:03:32.
Batch   600 of 10,871. Elapsed: 0:03:47.
Batch   640 of 10,871. Elapsed: 0:04:02.
Batch   680 of 10,871. Elapsed: 0:04:17.
Batch   720 of 10,871. Elapsed: 0:04:33.
Batch   760 of 10,871. Elapsed: 0:04:48.
Batch   800 of 10,871. Elapsed: 0:05:03.
Batch   840 of 10,871. Elapsed: 0:05:18.
Batch   880 of 10,871. Elapsed: 0:05:34.
Batch   920 of 10,871. Elapsed: 0:05:49.
Batch   960 of 10,871. Elapsed: 0:06:04.
Bat

Epoch:  50%|█████     | 1/2 [1:16:19<1:16:19, 4579.26s/it]

Validation F1-Score: 0.9825051455454279


Training...
Batch    40 of 10,871. Elapsed: 0:00:15.
Batch    80 of 10,871. Elapsed: 0:00:31.
Batch   120 of 10,871. Elapsed: 0:00:46.
Batch   160 of 10,871. Elapsed: 0:01:02.
Batch   200 of 10,871. Elapsed: 0:01:17.
Batch   240 of 10,871. Elapsed: 0:01:33.
Batch   280 of 10,871. Elapsed: 0:01:48.
Batch   320 of 10,871. Elapsed: 0:02:04.
Batch   360 of 10,871. Elapsed: 0:02:19.
Batch   400 of 10,871. Elapsed: 0:02:35.
Batch   440 of 10,871. Elapsed: 0:02:50.
Batch   480 of 10,871. Elapsed: 0:03:06.
Batch   520 of 10,871. Elapsed: 0:03:21.
Batch   560 of 10,871. Elapsed: 0:03:37.
Batch   600 of 10,871. Elapsed: 0:03:52.
Batch   640 of 10,871. Elapsed: 0:04:08.
Batch   680 of 10,871. Elapsed: 0:04:23.
Batch   720 of 10,871. Elapsed: 0:04:39.
Batch   760 of 10,871. Elapsed: 0:04:55.
Batch   800 of 10,871. Elapsed: 0:05:10.
Batch   840 of 10,871. Elapsed: 0:05:26.
Batch   880 of 10,871. Elapsed: 0:05:41.
Batch   920 of 10,871. Elapsed: 0:05:57.
Bat

Epoch: 100%|██████████| 2/2 [2:32:50<00:00, 4585.22s/it]  

Validation F1-Score: 0.9901484206051421


Training complete!





In [35]:
model.save_pretrained('./pytorch_bert_ner_model_v2/')

### Inference

In [36]:
tokenizer = BertTokenizer.from_pretrained(PRETRAIN_MODEL)
model = BertForTokenClassification.from_pretrained('C:\\Users\\stick\\kaggle\\pytorch_bert_ner_model_v2')
model = model.to(device)

In [37]:
testing_data = pd.read_csv("C:\\Users\\stick\\kaggle\\ner_test.csv",sep=",",encoding="utf8", header=None, keep_default_na=False, na_values=[], names=['Sentence', 'Word', 'Tag'])
testing_data.shape, testing_data

((2396710, 3),
          Sentence          Word Tag
 0        S2682021       SeaWiFS   O
 1        S2682021   ORM-derived   O
 2        S2682021             g   O
 3        S2682021             i   O
 4        S2682021         ͑443͒   O
 ...           ...           ...  ..
 2396705   S794046             a   O
 2396706   S794046  multiplicity   O
 2396707   S794046            of   O
 2396708   S794046      purposes   O
 2396709   S794046             .   O
 
 [2396710 rows x 3 columns])

In [38]:
testing_getter = SentenceGetter(testing_data)

In [39]:
sentences = [[tu[0] for tu in sentence] for sentence in testing_getter.sentences]
print(sentences[0])
labels = [[tu[1] for tu in sentence] for sentence in testing_getter.sentences]
print(labels[0])

['In', 'Pipeline', '2', ',', 'the', 'group-level', 'correlation', 'maps', 'of', 'each', 'ROI', 'were', 'saved', 'to', 'a', 'binary', 'mask', '(', 'with', 'positive', 'and', 'negative', 'functional', 'connectivity', 'separated', ')', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [40]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)
]

In [41]:
tokenized_texts_and_labels[:2]

[(['In',
   'Pi',
   '##pel',
   '##ine',
   '2',
   ',',
   'the',
   'group',
   '-',
   'level',
   'correlation',
   'maps',
   'of',
   'each',
   'R',
   '##O',
   '##I',
   'were',
   'saved',
   'to',
   'a',
   'binary',
   'mask',
   '(',
   'with',
   'positive',
   'and',
   'negative',
   'functional',
   'connectivity',
   'separated',
   ')',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']),
 (['The',
   'structural',
   'and',
   'resting',
   'state',
   'functional',
   'MR',
   '##I',
   'data',
   'of',
   'MC',
   '##I',
   'patients',
   'used',
   'in',
   'the',
   'present',
   'study',
   'were',
   'obtained',
   'from',
   'large',
   'multi',
   '##cent',
   '##er',
   'Alzheimer',
   "'",
   's',
   'Disease',
   'N',
   '##eur',
   '#

In [42]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [43]:
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], 
    maxlen=MAX_LEN, 
    dtype='long', 
    value=0.0, 
    truncating=TRUNCATING_TYPE, 
    padding=PADDING_TYPE
)
tags = pad_sequences(
    [[tag2idx.get(l) for l in lab] for lab in labels], 
    maxlen=MAX_LEN, 
    dtype='long',
    value=tag2idx["PAD"],
    truncating=TRUNCATING_TYPE,
    padding=PADDING_TYPE
)

In [44]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [45]:
testing_dataset = SeanDataset(
    word_tokens=input_ids,
    masks=attention_masks,
    tags=tags
)
    
testing_dataloader = DataLoader(
    testing_dataset,
    batch_size=BATCH_SIZE,
)

In [46]:
for d in testing_dataloader:
    print(d)
    break

{'word_tokens': tensor([[ 1130, 21902, 10522,  ...,     0,     0,     0],
        [ 1109,  8649,  1105,  ...,     0,     0,     0],
        [ 7154,  1215,  1107,  ...,     0,     0,     0],
        ...,
        [ 7549,  3622,  1104,  ...,     0,     0,     0],
        [ 1438,   117,  1142,  ...,     0,     0,     0],
        [ 7154,  1215,  1107,  ...,     0,     0,     0]]), 'masks': tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]]), 'tags': tensor([[0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3],
        ...,
        [0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3],
        [0, 0, 0,  ..., 3, 3, 3]])}


In [47]:
len(testing_dataloader)

1510

In [48]:
model.eval()
testing_loss, testing_accuracy = 0, 0
nb_testing_steps, nb_testing_examples = 0, 0
predictions, true_labels = [], []
testing_loss_values = []
for batch in testing_dataloader:
    b_input_ids = batch['word_tokens'].to(device).to(torch.int64)
    b_input_mask = batch['masks'].to(device).to(torch.int64)
    b_labels = batch['tags'].to(device).to(torch.int64)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    logits = outputs[1].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    testing_loss += outputs[0].mean().item()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)

testing_loss = testing_loss / len(testing_dataloader)
validation_loss_values.append(testing_loss)
print("Testing loss: {}".format(testing_loss))

pred_tags = []
for p, l in zip(predictions, true_labels):
    row = [tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != 'PAD']
    pred_tags.append(row)

testing_tags = []
for l in true_labels:
    row = [tag_values[l_i] for l_i in l if tag_values[l_i] != 'PAD']
    testing_tags.append(row)

print('Testing Accuracy: {}'.format(accuracy_score(pred_tags, testing_tags)))
print('Testing F1-Score: {}'.format(f1_score(pred_tags, testing_tags)))

Testing loss: 0.0015102616006626422
Testing Accuracy: 0.9996206797862479
Testing F1-Score: 0.9905139646339448


In [None]:
# Evaluate testing dataset


### Manual input inference

In [8]:
import json
import nltk
import csv
from nltk.tokenize import word_tokenize

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRETRAIN_MODEL)
model = BertForTokenClassification.from_pretrained('C:\\Users\\stick\\kaggle\\pytorch_bert_ner_model')

In [9]:
def get_dataset_tag(tokenized_sentence, pred_tags):
    dataset_names = []
    dataset_name = ''
    for (token, tag) in zip(tokenized_sentence, pred_tags):
        if tag == 'B-D' or tag == 'I-D':
            if token.startswith('##'):
                dataset_name += token.replace('##', '')
            else:
                dataset_name += ' ' + token
        else:
            if len(dataset_name) > 0:
                dataset_names.append(dataset_name.strip())
                dataset_name = ''
    return dataset_names

In [10]:
sentences = []

with open(f'C:\\Users\\stick\\kaggle\\0087b0b4-deda-471a-a8b7-706b9dc24990.json') as json_file:
    data = json.load(json_file)

    for section in data:
        # Skip section_title
#             section_title = section['section_title']
#             if '\n' not in section_title:
#                 paras.append(section_title)
#             else:
#                 paras.extend(section_title.split('\n'))

        text = section['text']
        section_sentences = nltk.sent_tokenize(text)
        for s in section_sentences:
            sentences.append(s)

In [15]:
for s in sentences:
    words = word_tokenize(s)
    tokenized_sentence = []
    for w in words:
        tokenized_word = tokenizer.tokenize(w)
        tokenized_sentence.extend(tokenized_word)
        
    # print(tokenized_sentence)
    input_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(tokenized_sentence)], 
        maxlen=MAX_LEN, 
        dtype='long', 
        value=0.0, 
        truncating=TRUNCATING_TYPE, 
        padding=PADDING_TYPE
    )
    # print(input_ids)
    
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]
    
    input_ids = torch.tensor(input_ids, dtype=torch.long),
    attention_masks = torch.tensor(attention_masks, dtype=torch.float),
    # print(attention_masks)
    
    model.eval()
    b_input_ids = input_ids[0].to(device).to(torch.int64)
    b_input_mask = attention_masks[0].to(device).to(torch.int64)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    logits = outputs[0].detach().cpu().numpy()
#     print(logits)
    predictions = np.argmax(logits, axis=2)[0]
#     print(predictions)
    pred_tags = [tag_values[w] for w in predictions]
    if 'B-D' in pred_tags or 'I-D' in pred_tags:
        dataset_names = get_dataset_tag(tokenized_sentence, pred_tags)
        print(dataset_names)

['Survey of Earned Doctorates']


In [16]:
from transformers import AutoTokenizer, AutoConfig

tokenizer = AutoTokenizer.from_pretrained(PRETRAIN_MODEL)
config = AutoConfig.from_pretrained(PRETRAIN_MODEL)

tokenizer.save_pretrained('C:\\Users\\stick\\kaggle\\tokenizer')
config.save_pretrained('C:\\Users\\stick\\kaggle\\tokenizer')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=570.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=435797.0), HTML(value='')))


