Code based in part on examples from the transformers library (https://github.com/huggingface/transformers/) and the tutorial by Chris McCormick and Nick Ryan (https://mccormickml.com/2019/07/22/BERT-fine-tuning/).

In [0]:
!pip install transformers==2.0.0

In [0]:
import torch
import transformers

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn

from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, Sampler, RandomSampler, SequentialSampler
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from transformers import BertForSequenceClassification, BertTokenizer

In [0]:
# Pretrained BERT checkpoint name, plus the tokenizer and model classes used with it
BERT_WEIGHTS = 'bert-base-multilingual-uncased'
BERT_TOKENIZER = BertTokenizer
BERT_MODEL = BertForSequenceClassification

# Maximum sequence length is 53: the max length in the training set (34) times 1.5,
# plus 2 for the CLS and SEP tokens (34 * 1.5 + 2 = 53)
MAX_SEQ_LEN = 53

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
def load_and_prepare_data(filename: str,
                          delimiter: str = '\t',
                          header: str = None,
                          names: list = ('sentence', 'intent'),
                          test: bool = False,
                          unique_labels: list = None):
    """
    Load a labelled sentence file and convert it into padded BERT input tensors.

    Rows with missing values are dropped. Every sentence is tokenized with
    BERT_TOKENIZER, wrapped in the tokenizer's CLS/SEP tokens, converted to
    vocabulary ids and padded/truncated to MAX_SEQ_LEN.

    :param filename: path to data file
    :param delimiter: delimiter character for CSV (default: tab)
    :param header: passed through to ``pandas.read_csv`` (default: None)
    :param names: data column names (default: ['sentence', 'intent']); the
        function reads the ``sentence`` and ``intent`` columns
    :param test: if True, return the whole file as one prediction set instead
        of splitting it into train/validation parts
    :param unique_labels: ordered label names; a label's position is its
        integer id. Required when ``test`` is True. When given, rows whose
        intent is not in this list are dropped.
    :return: if ``test`` is True, a tuple ``(input_ids, attention_masks, labels)``
        of tensors; otherwise ``((x_train, x_validate, y_train, y_validate,
        train_masks, validation_masks), unique_labels)``
    :raises ValueError: if ``test`` is True and ``unique_labels`` is empty/None
    """
    # Test data can only be encoded with the label ids of the training set
    if test and not unique_labels:
        raise ValueError('unique_labels must be provided when test=True')

    df = pd.read_csv(filename,
                     delimiter=delimiter,
                     header=header,
                     names=list(names)).dropna(how='any')

    # Keep only rows whose label is known
    if unique_labels:
        df = df[df.intent.isin(unique_labels)]

    print(f'Dataframe shape: {df.shape}')

    sentences = df.sentence.values

    # Get all unique labels, assign an integer value to each
    if not unique_labels:
        unique_labels = sorted(set(df.intent.values))
    label_to_index = {}
    for index, label in enumerate(unique_labels):
        # setdefault keeps the first position, matching list.index() semantics
        label_to_index.setdefault(label, index)
    labels = [label_to_index[label] for label in df.intent.values]
    print(f'Unique labels: {unique_labels}')

    # tokenize() does not add special tokens by itself, so CLS and SEP are added
    # explicitly; the word pieces are cut first so SEP survives truncation.
    tokenizer = BERT_TOKENIZER.from_pretrained(BERT_WEIGHTS)
    max_word_pieces = MAX_SEQ_LEN - 2
    tokenized_sentences = [
        [tokenizer.cls_token] + tokenizer.tokenize(sentence)[:max_word_pieces] + [tokenizer.sep_token]
        for sentence in sentences
    ]

    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_sentences]
    sequence_lengths = [len(ids) for ids in input_ids]

    # Pad ends of input sequences to MAX_SEQ_LEN
    input_ids = pad_sequences(input_ids, maxlen=MAX_SEQ_LEN, dtype='long',
                              padding="post", truncating="post")

    # Build the masks from the real sequence lengths rather than from id values
    attention_masks = [[1] * length + [0] * (MAX_SEQ_LEN - length)
                       for length in sequence_lengths]

    # Show one example for a quick sanity check (second row when there is one)
    if len(sentences):
        sample = min(1, len(sentences) - 1)
        print(sentences[sample])
        print(tokenized_sentences[sample])
        print(attention_masks[sample])

    if test:
        return tuple(torch.tensor(part) for part in (input_ids, attention_masks, labels))

    # A single call keeps ids, labels and masks aligned in the same split
    x_train, x_validate, y_train, y_validate, train_masks, validation_masks = train_test_split(
        input_ids, labels, attention_masks, random_state=2018, test_size=0.1)

    # Convert all of our data into torch tensors, the required datatype for our model
    data = (x_train, x_validate, y_train, y_validate, train_masks, validation_masks)

    return tuple(torch.tensor(part) for part in data), unique_labels

In [0]:
def make_dataloader_from_tensors(x: torch.tensor,
                                 y: torch.tensor,
                                 masks: torch.tensor,
                                 test: bool = False,
                                 batch_size: int = 32):
    """
    Wrap inputs, masks and labels in a batched DataLoader.

    :param x: input tensor (first element of every batch)
    :param y: label tensor (third element of every batch)
    :param masks: mask tensor (second element of every batch)
    :param test: if True, iterate in the original order (SequentialSampler);
        otherwise draw examples in random order (RandomSampler)
    :param batch_size: number of examples per batch (default: 32)
    :return: DataLoader yielding ``(x, masks, y)`` batches
    """
    # Note the element order inside a batch: x, masks, y
    dataset = TensorDataset(x, masks, y)

    if test:
        chosen_sampler = SequentialSampler(dataset)
    else:
        chosen_sampler = RandomSampler(dataset)

    return DataLoader(dataset, sampler=chosen_sampler, batch_size=batch_size)


In [0]:
def build_BERT_model(bert_model, bert_weights, num_labels,
                     lr=2e-5, warmup_steps=100, t_total=1000, weight_decay=0.01):
    """
    Build the classifier together with its optimizer and learning-rate scheduler.

    :param bert_model: model class exposing ``from_pretrained``
    :param bert_weights: name or path of the pretrained weights
    :param num_labels: number of output classes
    :param lr: AdamW learning rate (default: 2e-5)
    :param warmup_steps: scheduler warm-up steps (default: 100)
    :param t_total: total number of scheduler steps (default: 1000)
    :param weight_decay: decay applied to every parameter except biases and
        LayerNorm parameters (default: 0.01)
    :return: tuple ``(model, optimizer, scheduler)``
    """
    # Load BERT model with pretrained weights and move it to the selected device
    model = bert_model.from_pretrained(bert_weights, num_labels=num_labels)
    model.to(device)

    named_params = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    def _is_no_decay(param_name):
        return any(marker in param_name for marker in no_decay)

    # AdamW reads the per-group key 'weight_decay'; with any other key the
    # groups silently fall back to the optimizer default.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params if not _is_no_decay(n)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named_params if _is_no_decay(n)],
         'weight_decay': 0.0}
    ]

    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=t_total)

    return model, optimizer, scheduler

In [0]:
def score(y_true, y_pred, labels=None, scoring='f1-micro'):
    """
    Calculates a score given a gold standard and a test set

    :param y_true: gold-standard label ids (array-like)
    :param y_pred: predicted label ids (array-like)
    :param labels: label ids forwarded to ``f1_score`` for the F1 variants;
        ignored for 'average'
    :param scoring: one of 'f1-micro', 'f1-macro' or 'average', the fraction
        of positions where ``y_true`` equals ``y_pred`` (default: 'f1-micro')
    :return: the requested score
    :raises ValueError: if ``scoring`` is not one of the supported names
    """
    f1_averages = {'f1-micro': 'micro', 'f1-macro': 'macro'}

    if scoring == 'average':
        # Convert first so plain lists compare element-wise instead of as a whole
        return (np.asarray(y_true) == np.asarray(y_pred)).mean()

    if scoring in f1_averages:
        return f1_score(y_true=y_true, y_pred=y_pred, labels=labels,
                        average=f1_averages[scoring])

    supported = sorted(list(f1_averages) + ['average'])
    raise ValueError(f'Unknown scoring {scoring!r}; expected one of {supported}')
    

In [0]:
def train(model, optimizer, scheduler, data_loader_train, data_loader_validate, epochs=4, seed=500):
    """
    Fine-tune the model and report validation accuracy after every epoch.

    :param model: model called with ``input_ids``, ``attention_mask`` and
        ``labels``; the first element of its output is used as the loss
    :param optimizer: optimizer stepped once per batch
    :param scheduler: learning-rate scheduler stepped once per batch
    :param data_loader_train: loader yielding (input ids, masks, labels) batches
    :param data_loader_validate: loader passed to ``evaluate`` after each epoch
    :param epochs: number of passes over the training data (default: 4)
    :param seed: seed for the Python, NumPy and torch random generators;
        pass None to leave them untouched (default: 500)
    :return: tuple ``(total steps, mean training loss over all steps)``;
        the mean is 0.0 when no step was run
    """
    import random

    # Seed the generators so shuffling and dropout are repeatable
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    # Tracking variables (accumulated over all epochs)
    tr_loss = 0.0
    tr_steps = 0

    model.zero_grad()  # zero out gradients

    train_iterator = trange(epochs, desc='Epoch')

    for _ in train_iterator:
        # evaluate() switches to eval mode, so re-enable training mode per epoch
        model.train()

        epoch_iterator = tqdm(data_loader_train, desc='Iteration')
        for batch in epoch_iterator:
            # send batch to the selected device
            batch = tuple(t.to(device) for t in batch)

            # Forward pass
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            labels=batch[2])
            loss = outputs[0]

            # Backward pass
            loss.backward()

            tr_loss += loss.item()

            optimizer.step()
            scheduler.step()
            model.zero_grad()
            tr_steps += 1

        # Guard against an empty training loader
        mean_loss = tr_loss / tr_steps if tr_steps else 0.0
        print("Train loss: {}".format(mean_loss))

        # Validation
        validation_accuracy = evaluate(model, data_loader_validate)
        print('Validation accuracy: {}'.format(validation_accuracy))

    return tr_steps, (tr_loss / tr_steps if tr_steps else 0.0)

In [0]:
def evaluate(model, data_loader_validate, scoring='average'):
    """
    Score the model's predictions on a labelled data loader.

    Predictions and labels from all batches are collected and scored once, so
    a smaller final batch does not skew the result.

    :param model: model called with ``input_ids`` and ``attention_mask``; the
        first element of its output is used as the logits
    :param data_loader_validate: loader yielding (input ids, masks, labels) batches
    :param scoring: scoring name understood by ``score`` (default: 'average')
    :return: the score over all examples, or 0.0 if the loader yields no batches
    """
    model.eval()

    all_preds, all_labels = [], []
    for batch in data_loader_validate:
        input_ids, attention_mask, label_ids = (t.to(device) for t in batch)

        # No labels are passed, so no loss is computed and logits come first
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs[0]

        all_preds.append(np.argmax(logits.detach().cpu().numpy(), axis=1))
        all_labels.append(label_ids.detach().cpu().numpy())

    if not all_preds:
        return 0.0

    return score(y_true=np.concatenate(all_labels),
                 y_pred=np.concatenate(all_preds),
                 scoring=scoring)

In [0]:
def predict(model, data_loader_predict):
    """
    Predict a label id for every example in the data loader.

    Only the first two tensors of each batch (input ids and attention masks)
    are used; any further tensors, such as labels, are ignored.

    :param model: model called with ``input_ids`` and ``attention_mask``; the
        first element of its output is used as the logits
    :param data_loader_predict: loader whose batches start with input ids and masks
    :return: NumPy array with one predicted label id per example, in loader
        order (empty if the loader yields no batches)
    """
    model.eval()

    batch_logits = []
    for batch in data_loader_predict:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)

        # No labels are passed, so no loss is computed and logits come first
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs[0]

        # Move logits to CPU
        batch_logits.append(logits.detach().cpu().numpy())

    if not batch_logits:
        return np.array([], dtype=np.int64)

    return np.argmax(np.concatenate(batch_logits, axis=0), axis=1)

In [0]:
# This is where the magic starts...

In [0]:
# Snips
((x_train, x_validate, y_train, y_validate, train_masks, validation_mask),
 unique_labels) = load_and_prepare_data('./snips_train.tsv')

In [0]:
# Snips small
((x_train, x_validate, y_train, y_validate, train_masks, validation_mask),
 unique_labels) = load_and_prepare_data('./snips_small_train.tsv')

In [0]:
# Almawave-SLU
((x_train, x_validate, y_train, y_validate, train_masks, validation_mask),
 unique_labels) = load_and_prepare_data('./aw_slu_train.tsv')

In [0]:
# Almawave-SLU small
((x_train, x_validate, y_train, y_validate, train_masks, validation_mask),
 unique_labels) = load_and_prepare_data('./aw_slu_small_train.tsv')

In [0]:
# ATIS
((x_train, x_validate, y_train, y_validate, train_masks, validation_mask),
 unique_labels) = load_and_prepare_data('./atis_train.tsv')

In [0]:
# ATIS small
((x_train, x_validate, y_train, y_validate, train_masks, validation_mask),
 unique_labels) = load_and_prepare_data('./atis_small_train.tsv')

In [0]:
# Batched loaders for the training and validation splits
data_loader_train = make_dataloader_from_tensors(x=x_train, y=y_train, masks=train_masks)
data_loader_validate = make_dataloader_from_tensors(x=x_validate, y=y_validate,
                                                    masks=validation_mask)

In [0]:
# Number of output classes for the classifier, shown together with the label names
num_labels = len(unique_labels)
print(num_labels, unique_labels, sep='\n')

In [0]:
# Instantiate the classifier with its optimizer and learning-rate scheduler
model, optimizer, scheduler = build_BERT_model(BERT_MODEL, BERT_WEIGHTS, num_labels)

In [0]:
train(model=model,
      optimizer=optimizer,
      scheduler=scheduler,
      data_loader_train=data_loader_train,
      data_loader_validate=data_loader_validate)

In [0]:
# Snips test ("dev")
(prediction_ids,
 prediction_masks,
 prediction_labels) = load_and_prepare_data('snips_validate.tsv', test=True,
                                            unique_labels=unique_labels)

In [0]:
# Almawave-SLU test
(prediction_ids,
 prediction_masks,
 prediction_labels) = load_and_prepare_data('aw_slu_test.tsv', test=True,
                                            unique_labels=unique_labels)

In [0]:
# ATIS test
(prediction_ids,
 prediction_masks,
 prediction_labels) = load_and_prepare_data('atis_dev.tsv', test=True,
                                            unique_labels=unique_labels)

In [0]:
# test=True keeps the examples in file order so predictions line up with the labels
prediction_data_loader = make_dataloader_from_tensors(
    x=prediction_ids, y=prediction_labels, masks=prediction_masks, test=True)

In [0]:
# Predicted label id for every example in the prediction set
preds = predict(model=model, data_loader_predict=prediction_data_loader)


In [0]:
# Gold-standard label ids of the prediction set as a NumPy array, for scoring
true = np.array(prediction_labels)

In [0]:
# Fraction of examples whose predicted label matches the gold label
score(y_true=true, y_pred=preds, scoring='average')

In [0]:
# Micro-averaged F1 over the prediction set
score(y_true=true, y_pred=preds, scoring='f1-micro')

In [0]:
# Macro-averaged F1, restricted to the label ids that were actually predicted
score(y_true=true, y_pred=preds, labels=np.unique(preds), scoring='f1-macro')

In [0]:
# Aliases used by the confusion-matrix and pickling cells
y_true, y_pred = true, preds

In [0]:
# Confusion matrix over every label id, with label names on both axes
conf = confusion_matrix(y_true, y_pred, labels=list(range(len(unique_labels))))
df_cm = pd.DataFrame(data=conf, index=unique_labels, columns=unique_labels)


In [0]:
# Snips and Almawave-SLU
# The y-limit is taken from the matrix size instead of a hard-coded class count,
# so the plot stays correct for any number of labels.
sn.heatmap(df_cm, annot=True, fmt='g', cmap=plt.cm.Blues).set_ylim(float(len(df_cm)), 0.0)


In [0]:
# ATIS
# The y-limit is taken from the matrix size instead of a hard-coded class count,
# so the plot stays correct for any number of labels.
sn.heatmap(df_cm, annot=True, fmt='g', cmap=plt.cm.Blues).set_ylim(float(len(df_cm)), 0.0)


In [0]:
# ATIS
# Larger figure so the cell annotations stay readable when there are many classes
plt.clf()
plt.subplots(figsize=(20, 15))
# font = {'weight' : 'normal',
#         'size'   : 22}
# plt.rc('font', **font)
chart = sn.heatmap(df_cm, annot=True, fmt='g', cmap=plt.cm.Blues)
# chart.set_xticklabels(chart.get_xticklabels(), rotation=45)
# The y-limit follows the matrix size rather than a hard-coded class count
chart.set_ylim(float(len(df_cm)), 0.0)
chart



In [0]:
# Save the figure that holds the heatmap to an SVG file
chart.figure.savefig('atis_small_bert_cm.svg')

In [0]:
from collections import Counter
# Count how many times each label id was predicted
Counter(y_pred)

In [0]:
import pickle

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('atis_full_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('atis_full_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('atis_small_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('atis_small_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context manager makes sure the file is flushed and closed after writing
with open('atis_unique_labels.pkl', 'wb') as labels_file:
    pickle.dump(unique_labels, labels_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('snips_full_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('snips_full_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('snips_small_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('snips_small_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context manager makes sure the file is flushed and closed after writing
with open('snips_unique_labels.pkl', 'wb') as labels_file:
    pickle.dump(unique_labels, labels_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('snips_train_4_epochs_aslu_test_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('snips_train_4_epochs_aslu_test_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('snips_train_8_epochs_aslu_test_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('snips_train_8_epochs_aslu_test_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('aslu_full_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('aslu_full_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)
with open('aslu_unique_labels.pkl', 'wb') as labels_file:
    pickle.dump(unique_labels, labels_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('aslu_small_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('aslu_small_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('aslu_train_4_epochs_snips_test_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('aslu_train_4_epochs_snips_test_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)

In [0]:
# Context managers make sure each file is flushed and closed after writing
with open('aslu_train_8_epochs_snips_test_bert_pred.pkl', 'wb') as pred_file:
    pickle.dump(y_pred, pred_file)
with open('aslu_train_8_epochs_snips_test_bert_true.pkl', 'wb') as true_file:
    pickle.dump(y_true, true_file)