In [2]:
import warnings
warnings.filterwarnings('ignore')

import io
import random
import numpy as np
import mxnet as mx
import gluonnlp as nlp
from sentence_embedding.bert import data, model

# Seed NumPy, Python's `random` and MXNet so that runs are repeatable.
# NOTE(review): MXNet is seeded with 10000 while the other two use 100 —
# confirm this difference is intentional.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

# Device on which the model and every batch are placed by the cells below.
ctx = mx.cpu()

In [3]:
# Fetch the pretrained `bert_12_768_12` network together with its vocabulary.
# The pooler is kept; the decoder and classifier heads are switched off.
bert_base, vocabulary = nlp.model.get_model(
    'bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False
)
print(bert_base)

BERTModel(
  (encoder): BERTEncoder(
    (dropout_layer): Dropout(p = 0.1, axes=())
    (layer_norm): BERTLayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
    (transformer_cells): HybridSequential(
      (0): BERTEncoderCell(
        (dropout_layer): Dropout(p = 0.1, axes=())
        (attention_cell): MultiHeadAttentionCell(
          (_base_cell): DotProductAttentionCell(
            (_dropout_layer): Dropout(p = 0.1, axes=())
          )
          (proj_query): Dense(768 -> 768, linear)
          (proj_key): Dense(768 -> 768, linear)
          (proj_value): Dense(768 -> 768, linear)
        )
        (proj): Dense(768 -> 768, linear)
        (ffn): BERTPositionwiseFFN(
          (ffn_1): Dense(768 -> 3072, linear)
          (activation): GELU()
          (ffn_2): Dense(3072 -> 768, linear)
          (dropout_layer): Dropout(p = 0.1, axes=())
          (layer_norm): BERTLayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
        )
        (la

In [8]:
# read in train data set

def read_in_tokenizer(filename, max_len=128, class_labels=("0", "1", "2", "3")):
    """Read a labelled comma-separated file and turn it into BERT model inputs.

    The first line of the file is discarded, every remaining line is split on
    ',' and only fields 0 (text) and 1 (label) are kept. The first raw sample
    and the ids of the padding/CLS/SEP tokens are printed for inspection.

    Relies on the notebook globals `nlp`, `data` and `vocabulary`.

    NOTE(review): lines are split on every ',' — a text field that itself
    contains a comma would push part of the text into the label column.
    Confirm the input files never contain commas inside the text.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    max_len : int, default 128
        Maximum sequence length handed to `BERTDatasetTransform`.
    class_labels : sequence of str, default ("0", "1", "2", "3")
        The label strings that may appear in field 1.

    Returns
    -------
    The raw dataset with `BERTDatasetTransform` applied. The cells that consume
    it index each sample as (token ids, valid length, segment ids, label).
    """
    # Skip the first line of the file (num_discard_samples=1).
    num_discard_samples = 1
    # Fields are separated by commas.
    field_separator = nlp.data.Splitter(',')
    # Keep only the text (field 0) and the label (field 1).
    field_indices = [0, 1]
    data_train_raw = nlp.data.TSVDataset(filename,
                                         field_separator=field_separator,
                                         num_discard_samples=num_discard_samples,
                                         field_indices=field_indices)

    # Show the first raw sample: sentence, then label.
    sample_id = 0
    print(data_train_raw[sample_id][0])
    print(data_train_raw[sample_id][1])

    # Use the vocabulary from the pre-trained model for tokenization.
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)

    # Single-sentence classification: pad=True, pair=False, labels present.
    transform = data.transform.BERTDatasetTransform(bert_tokenizer, max_len,
                                                    class_labels=list(class_labels),
                                                    has_label=True,
                                                    pad=True,
                                                    pair=False)
    print('vocabulary used for tokenization = \n%s'%vocabulary)
    print('%s token id = %s'%(vocabulary.padding_token, vocabulary[vocabulary.padding_token]))
    print('%s token id = %s'%(vocabulary.cls_token, vocabulary[vocabulary.cls_token]))
    print('%s token id = %s'%(vocabulary.sep_token, vocabulary[vocabulary.sep_token]))

    return data_train_raw.transform(transform)
    
# Alternative training files used in earlier experiments are kept commented out.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere without editing them.
# trainfilename ="C:/Users/nwang/Desktop/nlp/bert/data/q2_balance_113_5/train.tsv"
trainfilename ='C:/Users/nwang/Desktop/nlp/code/q1_balance_1152_5_train.csv'
# "C:/Users/nwang/Desktop/nlp/code/q2_balance_339_5_train.csv"
data_train = read_in_tokenizer(trainfilename)
# Inspect the four fields of the first transformed sample.
sample_id=0
print('token ids = \n%s'%data_train[sample_id][0])
print('valid length = \n%s'%data_train[sample_id][1])
print('segment ids = \n%s'%data_train[sample_id][2])
print('label = \n%s'%data_train[sample_id][3])

Not enough alliant credit unions brick and mortar
3
vocabulary used for tokenization = 
Vocab(size=30522, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3
token ids = 
[    2  2025  2438  2035  2937  2102  4923  9209  5318  1998 14335     3
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1 

In [5]:
# The global name `model` is rebound below from the imported module to the
# classifier network. Import the module under an alias here so that this cell
# can be re-run: `model.classification` would otherwise fail on the second run
# because `model` is by then the network, not the module.
from sentence_embedding.bert import model as bert_model_module

# 4-way classifier head on top of the pretrained BERT encoder.
model = bert_model_module.classification.BERTClassifier(bert_base, num_classes=4, dropout=0.1)
# only need to initialize the classifier layer.
model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
model.hybridize(static_alloc=True)

# softmax cross entropy loss for classification
loss_function = mx.gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

# Shared accuracy metric; the training and evaluation cells reset and update it.
metric = mx.metric.Accuracy()



In [15]:
def bert_train(data_train, batch_size=32, lr=5e-6, num_epochs=10, log_interval=8):
    """Fine-tune the global `model` on `data_train` with Adam and gradient clipping.

    Relies on the notebook globals `model`, `loss_function`, `metric`, `ctx`,
    `nlp` and `mx`. The global `metric` is reset at the start of every epoch
    and accumulates over that epoch; the printed loss is the mean over the last
    `log_interval` batches. Batches after the last full log interval of an
    epoch are trained on but not reported.

    Parameters
    ----------
    data_train : dataset
        Transformed samples of (token ids, valid length, segment ids, label).
    batch_size : int, default 32
        Mini-batch size given to the bucket sampler.
    lr : float, default 5e-6
        Adam learning rate.
    num_epochs : int, default 10
        Number of passes over `data_train`.
    log_interval : int, default 8
        Print loss/accuracy every `log_interval` batches. Must be positive.

    Raises
    ------
    ValueError
        If `log_interval` is not a positive number.
    """
    if log_interval <= 0:
        raise ValueError('log_interval must be positive, got %r' % (log_interval,))

    # The FixedBucketSampler and the DataLoader for making the mini-batches.
    # item[1] is the valid length of each sample.
    train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[1]) for item in data_train],
                                                batch_size=batch_size,
                                                shuffle=True)
    bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=train_sampler)

    # update_on_kvstore=False so gradients can be clipped between
    # allreduce_grads() and update() below.
    trainer = mx.gluon.Trainer(model.collect_params(), 'adam',
                               {'learning_rate': lr, 'epsilon': 1e-9, 'wd': 0.01},
                               update_on_kvstore=False)

    # Collect all differentiable parameters
    # `grad_req == 'null'` indicates no gradients are calculated (e.g. constant parameters)
    # The gradients for these params are clipped later
    params = [p for p in model.collect_params().values() if p.grad_req != 'null']

    for epoch_id in range(num_epochs):
        metric.reset()
        step_loss = 0
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(bert_dataloader):
            # Move the batch to the target device; this needs no gradient
            # recording, so it is done before entering the autograd scope.
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            with mx.autograd.record():
                # Forward computation
                out = model(token_ids, segment_ids, valid_length.astype('float32'))
                ls = loss_function(out, label).mean()

            # And backwards computation
            ls.backward()

            # Gradient clipping
            trainer.allreduce_grads()
            nlp.utils.clip_grad_global_norm(params, 1)
            # The loss is already a batch mean, so the update uses batch size 1.
            trainer.update(1)

            step_loss += ls.asscalar()
            metric.update([label], [out])

            # Printing vital information
            if (batch_id + 1) % log_interval == 0:
                print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
                             .format(epoch_id, batch_id + 1, len(bert_dataloader),
                                     step_loss / log_interval,
                                     trainer.learning_rate, metric.get()[1]))
                step_loss = 0
        # Block until all asynchronous work of this epoch has finished.
        mx.nd.waitall()

In [16]:
# Fine-tune on the training set; bert_train prints loss and accuracy as it goes.
bert_train(data_train)

[Epoch 0 Batch 8/40] loss=0.1221, lr=0.0000050, acc=0.983
[Epoch 0 Batch 16/40] loss=0.1633, lr=0.0000050, acc=0.980
[Epoch 0 Batch 24/40] loss=0.1407, lr=0.0000050, acc=0.975
[Epoch 0 Batch 32/40] loss=0.1654, lr=0.0000050, acc=0.970
[Epoch 0 Batch 40/40] loss=0.1706, lr=0.0000050, acc=0.967
[Epoch 1 Batch 8/40] loss=0.1528, lr=0.0000050, acc=0.965
[Epoch 1 Batch 16/40] loss=0.1178, lr=0.0000050, acc=0.968
[Epoch 1 Batch 24/40] loss=0.1952, lr=0.0000050, acc=0.962
[Epoch 1 Batch 32/40] loss=0.1420, lr=0.0000050, acc=0.963
[Epoch 1 Batch 40/40] loss=0.1422, lr=0.0000050, acc=0.962
[Epoch 2 Batch 8/40] loss=0.1321, lr=0.0000050, acc=0.971
[Epoch 2 Batch 16/40] loss=0.1653, lr=0.0000050, acc=0.963
[Epoch 2 Batch 24/40] loss=0.1110, lr=0.0000050, acc=0.965
[Epoch 2 Batch 32/40] loss=0.1553, lr=0.0000050, acc=0.965
[Epoch 2 Batch 40/40] loss=0.1621, lr=0.0000050, acc=0.964
[Epoch 3 Batch 8/40] loss=0.1002, lr=0.0000050, acc=0.964
[Epoch 3 Batch 16/40] loss=0.1172, lr=0.0000050, acc=0.975
[

In [19]:
# Hand-made check of mx.metric.Accuracy: three rows of 4-class scores and their
# labels. Only the first row's arg-max (index 1) equals its label, so the
# expected accuracy is 1/3.
predicts = mx.nd.array([
    [5.64021111e-01, 6.42217535e-01, 3.33708376e-02, 3.21580499e-01],
    [5.36897779e-01, -6.86344206e-02, 1.65089130e-01, 5.23052096e-01],
    [3.70361090e-01, -1.75818592e-01, 3.87940586e-01, 4.53780949e-01],
])
labels = mx.nd.array([[1], [3], [2]])
acc = mx.metric.Accuracy()
acc.update(labels=labels, preds=predicts)
acc.get()

('accuracy', 0.3333333333333333)

In [12]:
def evaluate(data_train, batch_size=32):
    """Compute the accuracy of the global `model` on a labelled dataset.

    Relies on the notebook globals `model`, `metric`, `ctx`, `nlp` and `mx`.
    The shared global `metric` is reset and then updated batch by batch, so
    any value it held before the call is lost.

    Parameters
    ----------
    data_train : dataset
        Transformed samples of (token ids, valid length, segment ids, label).
        Despite the name, any labelled split (e.g. the test set) can be passed.
    batch_size : int, default 32
        Mini-batch size given to the bucket sampler.

    Returns
    -------
    (list, list)
        Metric names and the matching metric values, e.g.
        (['accuracy'], [0.79]).
    """
    # Accuracy does not depend on batch order, so shuffling is unnecessary.
    sampler = nlp.data.FixedBucketSampler(lengths=[int(item[1]) for item in data_train],
                                          batch_size=batch_size,
                                          shuffle=False)
    bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=sampler)

    metric.reset()
    for input_ids, valid_len, type_ids, label in bert_dataloader:
        out = model(
            input_ids.as_in_context(ctx), type_ids.as_in_context(ctx),
            valid_len.astype('float32').as_in_context(ctx))
        metric.update([label], [out])

    metric_nm, metric_val = metric.get()
    # Normalise to lists so single and composite metrics look the same to callers.
    if not isinstance(metric_nm, list):
        metric_nm, metric_val = [metric_nm], [metric_val]

    return metric_nm, metric_val

In [18]:
# Load and transform the labelled test split with the same pipeline as the
# training data.
# NOTE(review): absolute, machine-specific path.
testfilename = "C:/Users/nwang/Desktop/nlp/code/q1_balance_384_5_test.csv"
data_test = read_in_tokenizer(testfilename)

Now that we’ve resolved the issue with my credit card I am extremely pleased
0
vocabulary used for tokenization = 
Vocab(size=30522, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3


In [19]:
# Accuracy of the current model on the labelled test split.
evaluate(data_test)

(['accuracy'], [0.796875])

In [10]:
# NOTE(review): same call as the previous cell but with an earlier execution
# count; its saved output appears to come from a different run (presumably a
# different model state or test file — confirm, or delete this cell).
evaluate(data_test)

[3, 2, 2, 2, 2, 3, 2, 1, 2, 3, 1, 2, 1, 2, 1, 2, 2, 3, 1, 2, 1, 0, 1, 1, 0, 1, 3, 1, 2, 3, 0, 0, 3, 1, 1, 0, 0, 0, 2, 2, 2, 3, 2, 1, 1, 1, 2, 1, 2, 3, 2, 3, 2, 2, 0, 1, 2, 3, 1, 2, 3, 2, 2, 1, 1, 1, 1, 3, 2, 2, 0, 0, 3, 0, 1, 3, 3, 0, 0, 0, 0, 1, 1, 0, 0, 3, 0, 1, 0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 3, 1, 3, 3, 0, 3, 1, 3, 3, 0, 1, 1]


(['accuracy'], [0.831858407079646])

In [17]:
# Persist the fine-tuned classifier weights to disk.
# NOTE(review): absolute, machine-specific path; the same literal is repeated
# in the load cell.
params_saved='C:/Users/nwang/Desktop/nlp/bert/data/q2_balance_113_5/4classes/q1_model18_70'
nlp.utils.save_parameters(model, params_saved)

In [6]:
# Restore previously saved classifier weights into `model`.
# NOTE(review): this requires the parameter file to exist already and `model`
# to have been built; the execution counts suggest this cell was run in a
# different order than it appears in the notebook.
params_saved='C:/Users/nwang/Desktop/nlp/bert/data/q2_balance_113_5/4classes/q1_model18_70'
nlp.utils.load_parameters(model, params_saved)

In [9]:
def test(filename, batch_size=32, max_len=128):
    """Predict a class index for every line of an unlabelled tab-separated file.

    No line is discarded; each line is split on tabs and only field 0 (the
    text) is used. The first raw sample and the ids of the padding/CLS/SEP
    tokens are printed for inspection.

    Relies on the notebook globals `model`, `metric`, `vocabulary`, `data`,
    `ctx`, `nlp` and `mx`.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    batch_size : int, default 32
        Number of samples per inference batch.
    max_len : int, default 128
        Maximum sequence length handed to `BERTDatasetTransform`.

    Returns
    -------
    list of int
        Predicted class index per input line, in the same order as the lines
        of `filename`.
    """
    num_discard_samples = 0
    # Fields are separated by tabs.
    field_separator = nlp.data.Splitter('\t')
    # Only the text column is read; there is no label column.
    field_indices = [0]
    data_raw = nlp.data.TSVDataset(filename,
                                   field_separator=field_separator,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)
    # Show the first raw sentence.
    sample_id = 0
    print(data_raw[sample_id][0])

    # Use the vocabulary from pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)

    # Unlabelled single-sentence inputs, padded to max_len.
    transform = data.transform.BERTDatasetTransform(bert_tokenizer, max_len,
                                                    has_label=False,
                                                    pad=True,
                                                    pair=False)
    print('vocabulary used for tokenization = \n%s'%vocabulary)
    print('%s token id = %s'%(vocabulary.padding_token, vocabulary[vocabulary.padding_token]))
    print('%s token id = %s'%(vocabulary.cls_token, vocabulary[vocabulary.cls_token]))
    print('%s token id = %s'%(vocabulary.sep_token, vocabulary[vocabulary.sep_token]))

    data_infer = data_raw.transform(transform)

    # Iterate sequentially so predictions line up with the input rows. A
    # shuffled bucket sampler would return them in an arbitrary order. Every
    # sample is padded to max_len (pad=True), so plain batching works.
    bert_dataloader = mx.gluon.data.DataLoader(data_infer,
                                               batch_size=batch_size,
                                               shuffle=False)

    # Kept from the original cell: clears the shared global accuracy metric.
    metric.reset()

    results = []
    for input_ids, valid_length, type_ids in bert_dataloader:
        out = model(input_ids.as_in_context(ctx),
                    type_ids.as_in_context(ctx),
                    valid_length.astype('float32').as_in_context(ctx))

        # topk with k=1 yields one index per row, shape (batch, 1).
        indices = mx.nd.topk(out, k=1, ret_typ='indices', dtype='int32').asnumpy()
        # Flatten to plain Python ints.
        results.extend(indices.reshape(-1).tolist())

    mx.nd.waitall()
    return results

# Predict classes for the unlabelled dev file; the returned list is the cell output.
# NOTE(review): this rebinds `testfilename`, which an earlier cell uses for the
# labelled test CSV, and the path is absolute and machine-specific.
testfilename ="C:/Users/nwang/Desktop/nlp/bert/data/q2_balance_113_5/4classes/dev.tsv"
test(testfilename)

None it was perfect Very easy
vocabulary used for tokenization = 
Vocab(size=30522, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
[PAD] token id = 1
[CLS] token id = 2
[SEP] token id = 3
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]