In [None]:
# built in
import json
import random
import os
# append to path to allow relative imports
import sys
sys.path.append("..")

In [None]:
!pip install rich transformers

In [None]:
# In case you need to connect your repository (e.g. on a fresh Colab machine).
# Replace `<gh-token>` with a personal access token at run time only; never
# commit the real token, and clear this cell's output before sharing.
! git clone https://<gh-token>@github.com/verrannt/show-us-the-data.git

In [None]:
# Navigate to it: enter the cloned repository, pull the latest commits and
# then switch to `src/`, which becomes the working directory for the relative
# paths used in the following cells.
# NOTE: both chdir calls are relative to the current directory, so running
# this cell a second time without restarting the kernel will most likely fail.
os.chdir('show-us-the-data/')
! git pull
os.chdir('src/')

In [None]:
# Mount drive and set path to data (Google Colab only).
from google.colab import drive
drive.mount('/content/gdrive')

# Relative location of the dataset inside the mounted drive. It is only
# referenced from a commented-out line in the data-path cell below.
# NOTE(review): the drive is mounted at the absolute path '/content/gdrive'
# while this path is relative — confirm it resolves from the working directory.
GDRIVE_DATA_PATH = 'gdrive/MyDrive/Datasets/show-us-the-data/'

In [None]:
# 3rd party
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rich.console import Console
from rich.progress import track
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
from transformers import BertForTokenClassification, BertModel, AdamW, BertTokenizerFast
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# own
from utils.data.preproc import Pipeline, PipelineConfigs
from utils.data.parse import ParseUtils
from utils.generic import timer

# Prep Data

In [None]:
# Absolute path of the data folder, resolved two levels above the current
# working directory.
data_path = os.path.join(
    os.path.abspath('../../'), # Root of project
    #GDRIVE_DATA_PATH
    'data/coleridgeinitiative-show-us-the-data/' # Data folder
)

# Settings handed to the preprocessing pipeline.
configs = PipelineConfigs(
    DATA_PATH = data_path,
    MAX_LENGTH = 64,
    OVERLAP = 20,
    MAX_SAMPLE = None,
    SAVE = True,
    EXTRACTED_FILENAME = 'train_ner.data',
    TOKENIZED_FILENAME = 'train_ner.data.tokenized',
)

# Use the first GPU when one is available, otherwise fall back to the CPU,
# and report which device was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    print(f'Running on GPU: {torch.cuda.get_device_name(0)}')
else:
    print('Running on CPU')

In [None]:
pipeline = Pipeline(configs)

In [None]:
input_ids, tags, attention_masks = pipeline.load_outputs()

In [None]:
#input_ids, tags, attention_masks = input_ids[:500], tags[:500], attention_masks[:500]

In [None]:
# Intermezzo: replace every tag by an integer id. Ids are assigned in the
# order in which `np.unique` returns the distinct tags.
unique_tags = np.unique(tags)
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
tags = [list(map(tag2id.__getitem__, sentence)) for sentence in tags]

In [None]:
# Split token ids, tags and attention masks in ONE call so that all three are
# guaranteed to receive the same shuffled 90/10 partition (instead of relying
# on two separate calls that happen to share a seed).
(
    tr_inputs, val_inputs,
    tr_tags, val_tags,
    tr_masks, val_masks,
) = train_test_split(
    input_ids,
    tags,
    attention_masks,
    random_state=2018,
    test_size=0.1,
)

In [None]:
# Turn every split into a tensor and move it to the selected device.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = (
    torch.tensor(split).to(device)
    for split in (tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks)
)

In [None]:
BATCH_SIZE = 32

# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)

# Validation batches are visited sequentially.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(
    dataset=valid_data,
    batch_size=BATCH_SIZE,
    sampler=valid_sampler,
)

In [None]:
len(tr_inputs)

---

# Using `BertModel` to feed embeddings into separate classifier

In [None]:
# Get bert model. This will output raw hidden states.
# `num_labels` is passed along so that it can be read back from
# `model.config.num_labels` when the separate classifier is sized below.
model = BertModel.from_pretrained(
    'bert-base-cased',
    num_labels = 3
).to(device)

In [None]:
# Single linear layer used on top of the hidden states: it maps a vector of
# size `hidden_size` to one score per label.
cls = nn.Linear(
    in_features=model.config.hidden_size,
    out_features=model.config.num_labels,
)
cls = cls.to(device)

In [None]:
@timer
def create_batched_embeddings(
    model,  # The HuggingFace BertModel to create the embeddings
    inputs, # The inputs to be fed to the model
    masks,  # Attention masks for the inputs (same shape as inputs)
    batch_size:int=32,
    size_limit:int=10000,
    store_dir:str='temp/'
):
    """
    Feed `inputs` and `masks` through `model` in batches of `batch_size` and
    collect the model's `last_hidden_state` for every batch.

    Only full batches are processed: the trailing
    `inputs.shape[0] % batch_size` items are dropped.

    If more than `size_limit` items are processed, the embeddings are not kept
    in memory. Instead, each time the running item index passes another
    multiple of `size_limit`, the batches collected so far are converted to a
    float16 array and handed to `ParseUtils.save_outputs` under the name
    `embeddings.<item index>` together with `store_dir` (created if missing).
    Whatever is left after the last batch is saved the same way.

    Returns:
        A list with one numpy array per batch when everything fits in memory,
        or an empty list when the embeddings were written to `store_dir`.
    """
    _total = inputs.shape[0]
    # Drop the last data points if they don't fit with the batch size
    _total = (_total // batch_size) * batch_size
    print(f'Computing embeddings for {_total} items')

    # If the size gets too large, spill chunks to disk instead of keeping them
    _backup_to_file = _total > size_limit
    _save_count = 1
    if _backup_to_file:
        os.makedirs(store_dir, exist_ok=True)

    def _save_job(chunk, count):
        # Every chunk is stored in the same format: one float16 array
        print(f"Saving at {count} items ... ", end = '')
        ParseUtils.save_outputs(
            np.array(chunk, dtype=np.float16),
            store_dir,
            'embeddings.{}'
                .format(count)
        )
        print("Done.")

    # Collect embeddings in Python list
    _embeds = []

    for i in track(range(0, _total, batch_size),
                   description='Creating embeddings'):
        with torch.no_grad():
            # Use the function arguments, not notebook-level globals
            _out = model(
                inputs[i:i+batch_size], attention_mask=masks[i:i+batch_size]
            ).last_hidden_state.detach().cpu().numpy()

        _embeds.append(_out)

        if _backup_to_file and i >= _save_count * size_limit:
            _save_count += 1
            _save_job(_embeds, i)
            # Free memory
            _embeds = []

    if _backup_to_file:
        # Flush the remainder, unless the last batch was just saved
        if _embeds:
            _save_job(_embeds, _total)
        # Free memory
        _embeds = []

    return _embeds

def create_batched_tensor(
    tensor,
    batch_size:int=32,
):
    """
    Split the first dimension of `tensor` into batches of size `batch_size`.

    Trailing items that do not fill a complete batch are dropped. The result
    is a new CPU tensor of shape
    `(tensor.shape[0] // batch_size, batch_size, *tensor.shape[1:])` with the
    same dtype as the input; it does not share memory with `tensor`.
    """
    n_batches = tensor.shape[0] // batch_size
    # Drop the last data points if they don't fit with the batch size
    total = n_batches * batch_size

    # A slice + reshape yields the same values as stacking per-batch numpy
    # arrays, without the slow list-of-ndarrays -> tensor conversion.
    batched = tensor[:total].detach().cpu().reshape(
        n_batches, batch_size, *tensor.shape[1:]
    )
    # Copy so the caller never receives a view onto the input tensor
    return batched.clone()

In [None]:
# Start from an empty scratch directory two levels above the working directory.
# NOTE(review): the embedding cell below passes store_dir='temp/', which is
# relative to the working directory rather than '../../temp' — confirm which
# location `ParseUtils.save_outputs` actually writes to.
!rm -rf ../../temp
os.mkdir('../../temp')

In [None]:
emb_data = create_batched_embeddings(
    model, 
    tr_inputs, 
    tr_masks, 
    batch_size=BATCH_SIZE, 
    size_limit=2000,
    store_dir = 'temp/'
)

In [None]:
import tarfile
def compress(tar_file, members):
    """
    Adds files (`members`) to a tar_file and compress it

    `tar_file` is the path of the gzip-compressed archive to create and
    `members` an iterable of file/folder/link paths to add. A progress bar
    shows which member is currently being compressed.
    """
    # open file for gzip compressed writing; the context manager closes the
    # archive even when adding a member raises
    with tarfile.open(tar_file, mode="w:gz") as tar:
        # set the progress bar
        progress = tqdm(members)
        for member in progress:
            # set the progress description of the progress bar
            progress.set_description(f"Compressing {member}")
            # add file/folder/link to the tar file (compress)
            tar.add(member)

In [None]:
#compress("compressed.10016.tar.gz", ["../../temp/train_ner.data.tokenized.bert-base-cased.embeddings.bs32.10016"])

In [None]:
labels = create_batched_tensor(tr_tags, batch_size=BATCH_SIZE)
masks = create_batched_tensor(tr_masks, batch_size=BATCH_SIZE)

In [None]:
emb_data.shape, labels.shape, masks.shape

In [None]:
# Number of output classes, read back from the BERT config set at load time
NUM_LABELS = model.config.num_labels
# Number of passes over the pre-computed embeddings
EPOCHS = 200
LEARNING_RATE = 3e-5

# Its `ignore_index` is used by the training loop to exclude masked positions
loss_fn = nn.CrossEntropyLoss()

# Only the parameters of the linear head `cls` are handed to the optimizer
optimizer = torch.optim.SGD(cls.parameters(), lr=LEARNING_RATE)

In [None]:
loss_values, accuracies, f1_scores = [], [], []

In [None]:
# Train the linear head `cls` on the pre-computed embeddings.
# NOTE: `accuracy_score` / `f1_score` must be the scikit-learn versions here;
# the seqeval import further down rebinds both names, so run this cell before
# that import (or restart the kernel).
total = len(emb_data)
if total == 0:
    raise ValueError(
        'emb_data is empty - if the embeddings were written to disk, '
        'load them before training the classifier'
    )

USE_MASK = True

for epoch in range(1,EPOCHS+1):

    # Collect metrics for all batches
    losses, accs, f1s = [], [], []

    for X, y, mask in zip(emb_data, labels, masks):

        # Embeddings may be numpy arrays or CPU tensors; the classifier needs
        # float32 tensors on its own device.
        X = torch.as_tensor(X, dtype=torch.float32).to(device)
        flat_y = y.to(device).reshape(-1)

        # Positions that take part in the loss and in the metrics
        if USE_MASK:
            keep = mask.to(device).reshape(-1) == 1
        else:
            keep = torch.ones_like(flat_y, dtype=torch.bool)

        # Compute prediction
        logits = cls(X)
        # Flatten over batch and sequence
        flat_logits = logits.view(-1, NUM_LABELS)

        # Compute loss; masked-out positions get the loss' ignore index
        loss = loss_fn(
            flat_logits, flat_y.masked_fill(~keep, loss_fn.ignore_index)
        )

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Argmax on the tensor itself, then keep only unmasked positions so
        # that ignored targets do not count as errors
        preds = flat_logits.detach().argmax(dim=1)[keep].cpu().numpy()
        targets = flat_y[keep].cpu().numpy()

        # Compute metrics (scikit-learn expects y_true first, then y_pred)
        losses.append(loss.item())
        accs.append(accuracy_score(targets, preds))
        f1s.append(f1_score(targets, preds, average='weighted'))

    loss = np.mean(losses)
    acc = np.mean(accs)
    f1 = np.mean(f1s)

    loss_values.append(loss)
    accuracies.append(acc)
    f1_scores.append(f1)

    if epoch % 50 == 0:
        print('[ Epoch {:3}/{} ]   Loss: {:.5f}   Acc: {:.5f}   F1: {:.5f}'.format(epoch, EPOCHS, loss, acc, f1))

In [None]:
# Per-epoch loss, accuracy and F1 of the linear classifier in one figure
fig, ax = plt.subplots(figsize=(12,6))
ax.set_xlabel('Epochs')
for curve, name in (
    (loss_values, 'Loss'),
    (accuracies, 'Accuracy'),
    (f1_scores, 'F1 Score'),
):
    ax.plot(curve, label=name)
ax.legend()
plt.show()

---

# Using `BertForTokenClassification` with possible finetuning

In [None]:
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels = 3, # Hardcode for now
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
model.to(device)

In [11]:
# Models are initialized in eval mode by default. We can call model.train() to put it in train mode.
#model.train()

In [None]:
# Full finetuning to tune all model parameters
# Otherwise, only train classifier
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these fragments get no weight
    # decay. 'gamma'/'beta' are kept for checkpoints using the old LayerNorm
    # naming; 'LayerNorm.weight' covers the current naming.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
    # The optimizer reads the per-group option `weight_decay`; any other key
    # (such as `weight_decay_rate`) is not applied.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Only the classification head is handed to the optimizer
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [None]:
# Hyper-parameters for the fine-tuning run. `EPOCHS`, `LEARNING_RATE` and
# `optimizer` rebind names that the linear-classifier section above also uses.
EPOCHS = 4
LEARNING_RATE = 3e-5
EPSILON = 1e-8

# NOTE(review): `AdamW` is imported from `transformers` in this notebook;
# recent releases deprecate it in favour of `torch.optim.AdamW` — confirm
# against the installed version.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=LEARNING_RATE,
    eps=EPSILON
)

In [None]:
# Let the encoder follow the FULL_FINETUNING switch: its weights stay
# trainable for full fine-tuning and are frozen when only the classifier
# should be trained. Assigning the flag (instead of always False) also makes
# this cell safe to re-run after toggling the switch.
for param in model.base_model.parameters():
    param.requires_grad = FULL_FINETUNING

In [None]:
from transformers import get_linear_schedule_with_warmup

max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
from seqeval.metrics import f1_score, accuracy_score

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []
# Number of batches per pass over the training / validation data
n_train_samples = len(train_dataloader)
n_val_samples = len(valid_dataloader)

# Tag names ordered by their integer id, used to turn label ids back into
# tags for the seqeval metrics.
tag_values = [str(tag) for tag, _ in sorted(tag2id.items(), key=lambda kv: kv[1])]

for eidx in range(EPOCHS):

    print(f'Epoch {eidx + 1}/{EPOCHS}')

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):

        print(f'Step {step}/{n_train_samples}')

        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss = 0
    predictions , true_labels = [], []
    # Use a dedicated counter so the progress line does not show the stale
    # `step` left over from the training loop.
    for val_step, batch in enumerate(valid_dataloader):

        print(f'Step {val_step}/{n_val_samples}')

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass. Labels are provided, so the output holds the loss
            # first and the logit predictions second.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits, labels and attention mask to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        keep = b_input_mask.to('cpu').numpy().astype(bool)

        eval_loss += outputs[0].mean().item()
        # Keep only the positions the attention mask marks as real tokens, so
        # padding does not count towards the validation metrics.
        batch_preds = np.argmax(logits, axis=2)
        for pred_row, label_row, keep_row in zip(batch_preds, label_ids, keep):
            predictions.append(list(pred_row[keep_row]))
            true_labels.append(list(label_row[keep_row]))

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    # One list of tags per sentence (the format seqeval expects); positions
    # labelled "PAD" are skipped as before.
    pred_tags = [[tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
                 for p, l in zip(predictions, true_labels)]
    valid_tags = [[tag_values[l_i] for l_i in l if tag_values[l_i] != "PAD"]
                  for l in true_labels]
    # seqeval metrics take the ground truth first, then the predictions
    print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    print()


In [None]:
# Forward pass on the first 32 training examples as a quick sanity check.
# Run without gradient tracking so no autograd graph is kept alive.
with torch.no_grad():
    test_outputs = model(tr_inputs[:32], attention_mask=tr_masks[:32], labels=tr_tags[:32])