In [1]:
# Colab bootstrap: attach Google Drive, then switch into the project's `main`
# directory so the relative '../data', '../vocab' and '../model' paths used by
# later cells resolve.
import os
from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/clue/main')

Mounted at /content/gdrive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 3.9MB/s eta 0:00:01[K     |▌                               | 20kB 2.2MB/s eta 0:00:01[K     |▉                               | 30kB 3.1MB/s eta 0:00:01[K     |█                               | 40kB 2.6MB/s eta 0:00:01[K     |█▎                              | 51kB 3.1MB/s eta 0:00:01[K     |█▋                              | 61kB 3.6MB/s eta 0:00:01[K     |█▉                              | 71kB 4.0MB/s eta 0:00:01[K     |██                              | 81kB 4.3MB/s eta 0:00:01[K     |██▍                             | 92kB 4.7MB/s eta 0:00:01[K     |██▋                             | 102kB 4.5MB/s eta 0:00:01[K     |██▉                             | 112kB 4.5MB/s eta 0:00:01[K     |███▏                            | 122kB 4.5MB

In [3]:
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics import classification_report
import os
import json
import time
import logging
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

print("TensorFlow Version", tf.__version__)
# `tf.test.is_gpu_available()` is deprecated (TensorFlow itself prints a
# warning pointing to `tf.config.list_physical_devices('GPU')`), so query the
# physical device list directly; a non-empty list means a GPU is visible.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.3.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [4]:
def get_vocab(f_path):
  """Build an entry -> index mapping from a one-entry-per-line text file.

  Args:
    f_path: path to a UTF-8 encoded text file with one entry per line.

  Returns:
    dict mapping each line (trailing whitespace stripped) to its 0-based line
    number. If the same entry occurs on several lines, the last one wins.
  """
  # Decode explicitly as UTF-8: the default encoding of `open` depends on the
  # platform/locale and can fail on, or garble, non-ASCII entries.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip(): i for i, line in enumerate(f)}

# Hyper-parameters and file locations shared by the cells below.
params = {
  'pretrain_path': 'bert-base-chinese',  # checkpoint name given to `from_pretrained`
  'train_path': '../data/train.txt',
  'test_path': '../data/test.txt',
  'batch_size': 16,
  'buffer_size': 31728,  # shuffle buffer size; also used to derive the LR cycle length
  'init_lr': 1e-5,       # lower bound of the cyclical learning rate
  'max_lr': 4e-5,        # upper bound of the cyclical learning rate
  'label_smooth': .2,
  'n_epochs': 12,
  'num_patience': 5,     # evaluations without improvement before training stops
}

params['label2idx'] = get_vocab('../vocab/label.txt')

# `BertTokenizer` has no `lowercase` or `add_special_tokens` constructor
# options; unknown keywords are silently ignored, so the original call did not
# enable lower-casing. The supported option is `do_lower_case`. Special tokens
# are added by hand in `data_generator`, which also bypasses `tokenize()`, so
# the ids produced there are unaffected by this setting.
tokenizer = BertTokenizer.from_pretrained(params['pretrain_path'],
                                          do_lower_case = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [5]:
# stream data from text files
def data_generator(f_path, params):
  """Yield one `((token_ids, segment_ids), label_id)` example per JSON line.

  Every non-blank line of `f_path` is parsed as a JSON object with a 'content'
  string and a 'label' key. The text is split into single characters, wrapped
  in '[CLS]' / '[SEP]' and converted to ids by the module-level `tokenizer`.

  Args:
    f_path: path to a UTF-8 encoded file holding one JSON object per line.
    params: dict with 'label2idx' (label -> int). An optional 'max_len'
      (default 512) caps the sequence length, special tokens included.

  Yields:
    ((text, seg), label): `text` is the list of token ids, `seg` is a list of
    zeros with the same length, `label` is the integer class index.

  Raises:
    KeyError: if a record lacks 'content'/'label' or the label is unknown.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  # Characters that fit once '[CLS]' and '[SEP]' are accounted for. The
  # default of 512 matches the position-embedding limit of BERT-base
  # checkpoints; longer inputs would index past that table.
  max_chars = max(params.get('max_len', 512) - 2, 0)
  # Explicit UTF-8: the locale default encoding may not decode Chinese text.
  with open(f_path, encoding='utf-8') as f:
    print('Reading', f_path)
    for line in f:
      line = line.strip()
      if not line:
        continue  # tolerate blank lines, e.g. a trailing newline at EOF
      record = json.loads(line)
      chars = list(record['content'])[:max_chars]
      tokens = ['[CLS]'] + chars + ['[SEP]']
      text = tokenizer.convert_tokens_to_ids(tokens)
      seg = [0] * len(text)
      label = params['label2idx'][record['label']]
      yield (text, seg), int(label)


def dataset(is_training, params):
  """Build the batched `tf.data` pipeline for the train or the test split.

  Args:
    is_training: when true, stream `params['train_path']` and shuffle with a
      buffer of `params['buffer_size']`; otherwise stream
      `params['test_path']` in file order.
    params: configuration dict; it is also forwarded to `data_generator`.

  Returns:
    A `tf.data.Dataset` of `((token_ids, segment_ids), labels)` batches with
    `params['batch_size']` examples each, padded and prefetched.
  """
  # Per-example structure: two variable-length int32 vectors and a scalar label.
  element_shapes = (([None], [None]), ())
  element_types = ((tf.int32, tf.int32), tf.int32)
  pad_values = ((0, 0), -1)

  # The path is looked up inside the lambda, i.e. when the generator starts.
  path_key = 'train_path' if is_training else 'test_path'
  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params),
    output_types = element_types,
    output_shapes = element_shapes)

  # Only the training stream is shuffled; evaluation keeps file order.
  if is_training:
    ds = ds.shuffle(params['buffer_size'])

  ds = ds.padded_batch(params['batch_size'],
                       padded_shapes = element_shapes,
                       padding_values = pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [6]:
# input stream ids check
# Pull the first training example straight from the generator and show its
# token ids followed by its segment ids.
first_example = next(data_generator(params['train_path'], params))
(text, seg), _ = first_example
print(text)
print(seg)

Reading ../data/train.txt
[101, 112, 872, 4761, 6887, 1914, 840, 1914, 7353, 6818, 3300, 784, 720, 1408, 8043, 1506, 1506, 3300, 4788, 2357, 5456, 100, 4696, 4638, 741, 677, 1091, 4638, 872, 1420, 1521, 100, 872, 2157, 6929, 1779, 4788, 2357, 3221, 686, 4518, 677, 3297, 1920, 4638, 4788, 2357, 8024, 1506, 1506, 8024, 7745, 872, 4638, 1568, 2124, 3221, 6432, 2225, 1217, 2861, 4478, 4105, 2357, 3221, 686, 4518, 677, 3297, 1920, 4638, 4105, 2357, 1568, 100, 1506, 1506, 1506, 112, 112, 4268, 4268, 8024, 1961, 4638, 1928, 1355, 5456, 8013, 2769, 812, 1920, 2812, 7370, 3488, 2094, 6963, 6206, 5436, 677, 3341, 2769, 4692, 1168, 3312, 1928, 5361, 7027, 3300, 1928, 1355, 100, 671, 2137, 3221, 166, 166, 809, 1184, 1931, 1168, 4638, 8024, 872, 6432, 3221, 679, 3221, 8043, 138, 4495, 4567, 140, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
class BertFinetune(tf.keras.Model):
  """Pre-trained BERT encoder followed by a small classification head.

  The head is dropout -> Dense(300, swish) -> dropout -> Dense(n_labels) and
  is applied to element 1 of the BERT model's outputs. The last layer has no
  activation, so the model returns logits.
  """

  def __init__(self, params):
    """Create the encoder and the head layers.

    Args:
      params: dict with 'pretrain_path' (passed to
        `TFBertModel.from_pretrained`) and 'label2idx' (its length sets the
        number of output units).
    """
    super().__init__()
    n_labels = len(params['label2idx'])
    self.bert = TFBertModel.from_pretrained(
      params['pretrain_path'], trainable = True)
    self.drop_1 = tf.keras.layers.Dropout(rate = .1)
    self.fc = tf.keras.layers.Dense(
      units = 300, activation = tf.nn.swish, name = 'down_stream/fc')
    self.drop_2 = tf.keras.layers.Dropout(rate = .1)
    self.out = tf.keras.layers.Dense(units = n_labels, name = 'down_stream/out')

  def call(self, bert_inputs, training):
    """Compute class logits for a batch.

    Args:
      bert_inputs: sequence of tensors; each is cast to int32 and the list is
        handed to the BERT model as-is.
      training: forwarded to BERT and to both dropout layers.

    Returns:
      The output of the final dense layer (one value per label).
    """
    int_inputs = [tf.cast(tensor, tf.int32) for tensor in bert_inputs]
    bert_outputs = self.bert(int_inputs, training = training)
    # Index 1 of TFBertModel's outputs is the pooled sequence representation.
    pooled = bert_outputs[1]
    hidden = self.fc(self.drop_1(pooled, training = training))
    return self.out(self.drop_2(hidden, training = training))

In [8]:
# Instantiate the classifier and build it for three rank-2 inputs of
# unspecified size (the model is later called with a list of three tensors).
model = BertFinetune(params)
model.build([[None, None] for _ in range(3)])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=478309336.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [9]:
# Custom training loop: cyclical learning rate, gradient clipping, an
# evaluation pass after every epoch, best-weights checkpointing and early
# stopping.

# `step_size` argument of the cyclical schedule, expressed in optimizer steps.
# NOTE(review): if 'buffer_size' equals the number of training examples this
# is about two epochs' worth of batches -- confirm.
step_size = 2 * params['buffer_size'] // params['batch_size']
decay_lr = tfa.optimizers.Triangular2CyclicalLearningRate(
  initial_learning_rate = params['init_lr'],
  maximal_learning_rate = params['max_lr'],
  step_size = step_size,)
# The optimizer is created with a fixed rate; the schedule value is written
# into `optim.lr` by hand before every update (see below).
optim = tf.optimizers.Adam(params['init_lr'])
global_step = 0

# Early-stopping bookkeeping: best accuracy so far and the number of
# consecutive evaluations that did not beat it.
best_acc = .0
count = 0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

# The epoch index is unused; note that `_` is rebound inside the loop by the
# `clip_by_global_norm` unpacking, which is harmless for that reason.
for _ in range(params['n_epochs']):
  # TRAINING
  for ((text, seg), labels) in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      # `tf.sign(text)` is 1 where the id is non-zero and 0 where it is 0 (the
      # padding value used by `dataset`); presumably the attention mask.
      logits = model([text, tf.sign(text), seg], training=True)
      # Softmax cross-entropy against one-hot targets with label smoothing.
      loss = tf.compat.v1.losses.softmax_cross_entropy(
        tf.one_hot(labels, len(params['label2idx']), dtype=tf.float32),
        logits = logits,
        label_smoothing = params['label_smooth'],)
      
    # Set this step's learning rate from the schedule, then clip the gradients
    # to a global norm of 5 before applying them.
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, 5.)
    optim.apply_gradients(zip(grads, model.trainable_variables))
    
    # Log every 100 steps. 'Spent' is the wall time since the previous log
    # line, so it also includes any evaluation that ran in between.
    if global_step % 100 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    global_step += 1
  
  # EVALUATION
  # A fresh metric and fresh label lists are created for every epoch.
  m = tf.keras.metrics.Accuracy()
  intent_true = []
  intent_pred = []

  for ((text, seg), labels) in dataset(is_training=False, params=params):
    logits = model([text, tf.sign(text), seg], training=False)
    y_intent = tf.argmax(logits, -1)
    m.update_state(y_true=labels, y_pred=y_intent)
    # Collected for the per-class report below.
    intent_true += labels.numpy().flatten().tolist()
    intent_pred += y_intent.numpy().flatten().tolist()

  acc = m.result().numpy()
  logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))

  logger.info('\n'+classification_report(y_true = intent_true,
                                         y_pred = intent_pred,
                                         labels = list(params['label2idx'].values()),
                                         target_names = list(params['label2idx'].keys()),
                                         digits=3))

  # Save the weights only when accuracy strictly improves; otherwise count the
  # evaluation towards the patience limit.
  if acc > best_acc:
    best_acc = acc
    model.save_weights('../model/bert_finetune')
    count = 0
  else:
    count += 1
  logger.info("Best Accuracy: {:.3f}".format(best_acc))

  # Early stopping after 'num_patience' consecutive non-improving evaluations.
  if count == params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break

Reading ../data/train.txt
INFO:tensorflow:Step 0 | Loss: 1.9473 | Spent: 7.5 secs | LR: 0.000010
INFO:tensorflow:Step 100 | Loss: 1.9586 | Spent: 37.1 secs | LR: 0.000011
INFO:tensorflow:Step 200 | Loss: 1.5733 | Spent: 37.1 secs | LR: 0.000012
INFO:tensorflow:Step 300 | Loss: 1.4134 | Spent: 37.1 secs | LR: 0.000012
INFO:tensorflow:Step 400 | Loss: 1.5317 | Spent: 36.8 secs | LR: 0.000013
INFO:tensorflow:Step 500 | Loss: 1.4112 | Spent: 37.1 secs | LR: 0.000014
INFO:tensorflow:Step 600 | Loss: 1.4376 | Spent: 37.2 secs | LR: 0.000015
INFO:tensorflow:Step 700 | Loss: 1.2993 | Spent: 37.3 secs | LR: 0.000015
INFO:tensorflow:Step 800 | Loss: 1.2889 | Spent: 37.0 secs | LR: 0.000016
INFO:tensorflow:Step 900 | Loss: 1.0945 | Spent: 36.8 secs | LR: 0.000017
INFO:tensorflow:Step 1000 | Loss: 1.5279 | Spent: 36.5 secs | LR: 0.000018
INFO:tensorflow:Step 1100 | Loss: 1.5483 | Spent: 37.1 secs | LR: 0.000018
INFO:tensorflow:Step 1200 | Loss: 1.1676 | Spent: 36.9 secs | LR: 0.000019
INFO:tensorf