In [1]:
# built in
import json
import random
# append to path to allow relative imports
import sys
sys.path.append("..")

# 3rd party
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from tqdm import tqdm
from transformers import BertForTokenClassification, AdamW, BertTokenizerFast
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from rich.console import Console
from utils.data.preproc import Pipeline, PipelineConfigs
from utils.data.parse import ParseUtils
from utils.generic import timer
from sklearn.model_selection import train_test_split

from rich.progress import track

# own
from utils.data.parse import ParseUtils

# Prep Data

In [2]:
data_path = os.path.join(
    os.path.abspath('../../'), # Root of project
    'data/coleridgeinitiative-show-us-the-data/' # Data folder
)

configs = PipelineConfigs(
    DATA_PATH = data_path,
    MAX_LENGTH = 64,
    OVERLAP = 20,
    MAX_SAMPLE = None,
    SAVE = True,
    EXTRACTED_FILENAME = 'train_ner.data',
    TOKENIZED_FILENAME = 'train_ner.data.tokenized',
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if n_gpu > 0: torch.cuda.get_device_name(0)

In [3]:
pipeline = Pipeline(configs)

In [4]:
input_ids, tags, attention_masks = pipeline.load_outputs()

In [5]:
# Intermezzo fix tag labels
tag2id = {t:i for i, t in enumerate(np.unique(tags))}
tags = [[tag2id[tag] for tag in sent] for sent in tags]

In [7]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)


In [8]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [9]:
BATCH_SIZE = 32

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

In [10]:
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels = 3, # Hardcode for now
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [11]:
# Models are initialized in eval mode by default. We can call model.train() to put it in train mode.
#model.train()

In [12]:
# Full finetuning to tune all model parameters
# Otherwise, only train classifier
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [13]:
EPOCHS = 4
LEARNING_RATE = 3e-5
EPSILON = 1e-8

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=LEARNING_RATE,
    eps=EPSILON
)    


In [19]:
for param in model.base_model.parameters():
    param.requires_grad = False

In [20]:
from transformers import get_linear_schedule_with_warmup

max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [21]:
from seqeval.metrics import f1_score, accuracy_score

In [22]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []
n_train_samples = len(train_dataloader)
n_val_samples = len(valid_dataloader)

for eidx in range(EPOCHS):

    print(f'Epoch {eidx}/{EPOCHS}')

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):

        print(f'Step {step}/{n_train_samples}')

        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:

        print(f'Step {step}/{n_val_samples}')

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()


Epoch 0/4
Step 0/15792
Step 1/15792
Step 2/15792
Step 3/15792
Step 4/15792
Step 5/15792
Step 6/15792
Step 7/15792
Step 8/15792
Step 9/15792
Step 10/15792
Step 11/15792
Step 12/15792


KeyboardInterrupt: 

In [23]:
outputs = model(tr_inputs[:32], attention_mask=tr_masks[:32], labels=tr_tags[:32])

In [None]:
loss = outputs.loss
loss.backward()
optimizer.step()

In [27]:
outputs.logits.size()

torch.Size([32, 64, 3])