In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Switch to the project's `main` directory; the relative '../data/...' paths
# used further down are resolved against this working directory.
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_matching/joint/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Colab-specific magic that selects the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# NOTE(review): the install is unpinned, so the transformers version can drift
# between runs — consider pinning it for reproducibility.
!pip install transformers



In [None]:
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics import classification_report

import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
import pprint
import logging
import time
import math
import json
import csv

# Report the runtime environment so results can be tied to a TF version / device.
print("TensorFlow Version", tf.__version__)
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the replacement recommended by TensorFlow itself.
print('GPU Enabled:', len(tf.config.list_physical_devices('GPU')) > 0)

TensorFlow Version 2.3.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [None]:
# Hyper-parameters and file locations shared by the rest of the notebook.
params = {
  'pretrain_path': 'bert-base-chinese',  # pretrained checkpoint for tokenizer and encoder
  'train_path': '../data/train.json',
  'test_path': '../data/dev.json',
  'batch_size': 32,
  'max_len': 128,                        # token budget per pair, incl. [CLS] and both [SEP]
  'buffer_size': 34334 + 100000,         # shuffle buffer; presumably the sizes of the two training sets — TODO confirm
  'init_lr': 1e-5,                       # lower bound of the cyclical learning rate
  'max_lr': 4e-5,                        # upper bound of the cyclical learning rate
  'n_epochs': 12,
  'clip_norm': 5.,                       # global-norm gradient clipping threshold
  'label_smooth': .0,                    # 0 disables label smoothing
  'num_patience': 5,                     # epochs without improvement before early stopping
}

# `do_lower_case` is the keyword BertTokenizer actually recognises. The previous
# `lowercase` / `add_special_tokens` keywords are not constructor options and
# were silently ignored (special tokens are added by hand in the data generators).
tokenizer = BertTokenizer.from_pretrained(params['pretrain_path'],
                                          do_lower_case = True)

In [None]:
def get_vocab(f_path):
  """Load a vocabulary file into a ``{entry: line_index}`` dictionary.

  Args:
    f_path: path to a text file holding one vocabulary entry per line.

  Returns:
    dict mapping each line (trailing whitespace removed) to its 0-based line
    number. If an entry appears more than once, the last occurrence wins.
  """
  # Read as UTF-8 explicitly so non-ASCII entries do not depend on the
  # platform's default encoding.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip(): i for i, line in enumerate(f)}

In [None]:
def _encode_pair(text1, text2, label, max_len=None, convert=None):
  """Encode one sentence pair as character-level BERT inputs.

  Args:
    text1, text2: raw sentences; every character becomes one token.
    label: value convertible to ``int``.
    max_len: token budget including ``[CLS]`` and both ``[SEP]``; defaults to
      ``params['max_len']``.
    convert: callable mapping a token list to an id list; defaults to
      ``tokenizer.convert_tokens_to_ids``.

  Returns:
    ``((token_ids, segment_ids), int_label)``.
  """
  if max_len is None:
    max_len = params['max_len']
  if convert is None:
    # NOTE(review): characters are looked up as-is (no lower-casing or other
    # normalisation), so characters missing from the vocab presumably map to
    # the unknown token — confirm this is intended.
    convert = tokenizer.convert_tokens_to_ids
  # The 3 extra slots are [CLS] plus the two [SEP] markers. When the pair is
  # too long, both sides are cut to the same half-budget.
  if len(text1) + len(text2) + 3 > max_len:
    half = (max_len - 3) // 2
    text1, text2 = text1[:half], text2[:half]
  chars1, chars2 = list(text1), list(text2)
  tokens = ['[CLS]'] + chars1 + ['[SEP]'] + chars2 + ['[SEP]']
  # Segment 0 covers [CLS] + sentence 1 + first [SEP]; segment 1 covers the rest.
  seg = [0] * (len(chars1) + 2) + [1] * (len(chars2) + 1)
  return (convert(tokens), seg), int(label)


def _csv_pairs(f_path):
  """Yield ``(text1, text2, label)`` from a CSV file, skipping its header row."""
  # newline='' is what the csv module expects; UTF-8 is made explicit so the
  # result does not depend on the platform's default encoding.
  with open(f_path, encoding='utf-8', newline='') as f:
    print('Reading', f_path)
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # header
    for row in reader:
      if not row:  # blank line (e.g. trailing newline at end of file)
        continue
      text1, text2, label = row
      yield text1, text2, label


def _json_pairs(f_path):
  """Yield ``(sentence1, sentence2, label)`` from a JSON-lines file."""
  with open(f_path, encoding='utf-8') as f:
    print('Reading', f_path)
    for line in f:
      line = line.strip()
      if not line:  # tolerate blank lines
        continue
      record = json.loads(line)
      yield record['sentence1'], record['sentence2'], record['label']


def data_gen_cs(f_path='../data/test.csv', max_len=None, convert=None):
  """Generate encoded examples from the CSV evaluation set.

  Yields ``((token_ids, segment_ids), label)``. All arguments are optional and
  default to the notebook's original hard-coded path and global settings.
  """
  for text1, text2, label in _csv_pairs(f_path):
    yield _encode_pair(text1, text2, label, max_len, convert)


def data_gen_js(f_path='../data/dev.json', max_len=None, convert=None):
  """Generate encoded examples from the JSON-lines evaluation set.

  Yields ``((token_ids, segment_ids), label)``. All arguments are optional and
  default to the notebook's original hard-coded path and global settings.
  """
  for text1, text2, label in _json_pairs(f_path):
    yield _encode_pair(text1, text2, label, max_len, convert)


def joint_data_gen(csv_path='../data/train.csv', json_path='../data/train.json',
                   max_len=None, convert=None):
  """Generate encoded training examples from both corpora, CSV first then JSON.

  Yields ``((token_ids, segment_ids), label)``. All arguments are optional and
  default to the notebook's original hard-coded paths and global settings.
  """
  for text1, text2, label in _csv_pairs(csv_path):
    yield _encode_pair(text1, text2, label, max_len, convert)
  for text1, text2, label in _json_pairs(json_path):
    yield _encode_pair(text1, text2, label, max_len, convert)

In [None]:
def get_datasets(params):
  """Build the tf.data pipelines for training and for the two evaluation sets.

  Args:
    params: dict providing 'buffer_size' (shuffle buffer) and 'batch_size'.

  Returns:
    ``(ds_train, ds_test_js, ds_test_cs)``. Each yields padded batches of
    ``((token_ids, segment_ids), labels)``; only the training set is shuffled.
  """
  elem_shapes = (([None], [None]), ())
  elem_types = ((tf.int32, tf.int32), tf.int32)
  # Both id sequences are padded with 0. Labels are scalars, so their padding
  # value of -1 never ends up in a batch.
  pad_values = ((0, 0), -1)

  def build(make_gen, shuffle):
    """Wrap a generator factory in a (optionally shuffled) batched dataset."""
    ds = tf.data.Dataset.from_generator(
      make_gen,
      output_shapes = elem_shapes,
      output_types = elem_types,)
    if shuffle:
      ds = ds.shuffle(params['buffer_size'])
    ds = ds.padded_batch(params['batch_size'], elem_shapes, pad_values)
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

  train = build(lambda: joint_data_gen(), True)
  test_js = build(lambda: data_gen_js(), False)
  test_cs = build(lambda: data_gen_cs(), False)

  return train, test_js, test_cs

In [None]:
# input stream ids check
# Pull the first example from the training generator and print its token ids
# and segment ids; the label is discarded.
(text, seg), _ = next(joint_data_gen())
print(text)
print(seg)

Reading ../data/train.csv
[101, 4500, 2544, 928, 6963, 127, 2399, 8024, 2544, 928, 3766, 3300, 2544, 5108, 6587, 1216, 5543, 102, 125, 511, 100, 100, 1384, 4772, 3341, 2544, 5108, 6587, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
class BertFinetune(tf.keras.Model):
  """Pretrained BERT encoder followed by a small head producing one logit per example."""

  def __init__(self, params):
    """Create the encoder from ``params['pretrain_path']`` and the downstream layers."""
    super().__init__()
    self.bert = TFBertModel.from_pretrained(params['pretrain_path'],
                                            trainable = True)
    self.drop_1 = tf.keras.layers.Dropout(.1)
    self.fc = tf.keras.layers.Dense(300, tf.nn.swish, name='down_stream/fc')
    self.drop_2 = tf.keras.layers.Dropout(.1)
    self.out = tf.keras.layers.Dense(1, name='down_stream/out')

  def call(self, bert_inputs, training):
    """Run the forward pass.

    Args:
      bert_inputs: sequence of tensors handed to the BERT encoder as a list;
        each one is cast to int32 first.
      training: forwarded to the encoder and to both dropout layers.

    Returns:
      The output of the final Dense(1) layer with axis 1 squeezed away.
    """
    int_inputs = [tf.cast(tensor, tf.int32) for tensor in bert_inputs]
    # Element 1 of the encoder output is presumably the pooled sentence
    # representation — confirm against the transformers version in use.
    pooled = self.bert(int_inputs, training=training)[1]
    hidden = self.fc(self.drop_1(pooled, training=training))
    score = self.out(self.drop_2(hidden, training=training))
    return tf.squeeze(score, 1)

In [None]:
# Instantiate the model and build it for three rank-2 inputs with unknown
# dimensions, so its variables exist before training starts.
model = BertFinetune(params)
model.build([[None, None], [None, None], [None, None]])
# Sanity check: list every trainable variable together with its shape.
pprint.pprint([(v.name, v.shape) for v in model.trainable_variables])

Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


[('tf_bert_model/bert/embeddings/word_embeddings/weight:0',
  TensorShape([21128, 768])),
 ('tf_bert_model/bert/embeddings/position_embeddings/embeddings:0',
  TensorShape([512, 768])),
 ('tf_bert_model/bert/embeddings/token_type_embeddings/embeddings:0',
  TensorShape([2, 768])),
 ('tf_bert_model/bert/embeddings/LayerNorm/gamma:0', TensorShape([768])),
 ('tf_bert_model/bert/embeddings/LayerNorm/beta:0', TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/query/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/query/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/key/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/key/bias:0',
  TensorShape([768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/value/kernel:0',
  TensorShape([768, 768])),
 ('tf_bert_model/bert/encoder/layer_._0/attention/self/value/bias:0',
  TensorShape([768]

In [None]:
def label_smoothing(label, smooth):
  """Pull binary targets towards 0.5 by a factor of ``smooth``.

  Args:
    label: target value(s); anything supporting ``*`` and ``+`` with floats.
    smooth: smoothing strength; values that are not greater than 0 disable it.

  Returns:
    ``label * (1 - smooth) + 0.5 * smooth`` when ``smooth > 0``, otherwise
    ``label`` unchanged.
  """
  if not smooth > 0.:
    return label
  return label * (1 - smooth) + 0.5 * smooth

In [12]:
# Triangular2 cyclical schedule between init_lr and max_lr. The optimizer's
# learning rate is overwritten from this schedule before every update.
decay_lr = tfa.optimizers.Triangular2CyclicalLearningRate(
  initial_learning_rate = params['init_lr'],
  maximal_learning_rate = params['max_lr'],
  step_size = 2 * params['buffer_size'] // params['batch_size'],)
optim = tf.optimizers.Adam(params['init_lr'])
global_step = 0

best_acc, best_acc1, best_acc2 = .0, .0, .0
count = 0  # consecutive epochs without improvement of the averaged accuracy

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)


def _evaluate(dataset, title):
  """Score `model` on one evaluation dataset and log accuracy plus a report.

  Args:
    dataset: batches of ``((token_ids, segment_ids), labels)``.
    title: heading logged before the metrics.

  Returns:
    Accuracy over the whole dataset (predictions thresholded at 0.5).
  """
  metric = tf.keras.metrics.Accuracy()
  intent_true = []
  intent_pred = []

  for ((text, seg), labels) in dataset:
    # tf.sign(text) is 1 for real tokens and 0 for padding (id 0): the attention mask.
    probs = tf.sigmoid(model([text, tf.sign(text), seg], training=False))
    y_pred = tf.cast(tf.math.greater_equal(probs, .5), tf.int32)
    metric.update_state(y_true=labels, y_pred=y_pred)
    intent_true += labels.numpy().flatten().tolist()
    intent_pred += y_pred.numpy().flatten().tolist()

  acc = metric.result().numpy()
  logger.info(title)
  logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))
  logger.info('\n'+classification_report(y_true = intent_true,
                                         y_pred = intent_pred,
                                         labels = [0, 1],
                                         target_names = ['Not Matched', 'Matched'],
                                         digits = 3))
  return acc


# The pipelines are re-iterable, so they are built once instead of once per epoch.
ds_train, ds_test_js, ds_test_cs = get_datasets(params)

for epoch in range(params['n_epochs']):
  # TRAINING
  for ((text, seg), labels) in ds_train:
    labels = tf.cast(labels, tf.float32)
    num_neg = tf.reduce_sum(tf.cast(tf.equal(labels, 0.), tf.float32)).numpy()
    num_pos = tf.reduce_sum(labels).numpy()
    # Re-balance the classes per batch. If either class is absent, fall back to
    # a neutral weight: with no positives the ratio is undefined, and with no
    # negatives a weight of 0 would zero out the loss of every example.
    if num_pos == 0. or num_neg == 0.:
      pos_weight = 1.
    else:
      pos_weight = num_neg / num_pos

    with tf.GradientTape() as tape:
      logits = model([text, tf.sign(text), seg], training=True)
      loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
        labels = label_smoothing(labels, params['label_smooth']),
        logits = logits,
        pos_weight = pos_weight))

    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, model.trainable_variables))

    if global_step % 100 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
        global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    global_step += 1

  # EVALUATION 1
  acc_1 = _evaluate(ds_test_cs, '测试集：微众银行智能客服')

  # EVALUATION 2
  acc_2 = _evaluate(ds_test_js, '测试集：蚂蚁金融语义相似度')

  # Define Where To Save Model and Stop Training
  acc = (acc_1 + acc_2) / 2.
  if acc > best_acc:
    best_acc = acc
    best_acc1 = acc_1
    best_acc2 = acc_2
    # you can save model here
    count = 0
  else:
    count += 1
  logger.info("Best | Accuracy 1: {:.3f} | Accuracy 2: {:.3f}".format(best_acc1, best_acc2))

  if count == params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break

Reading ../data/train.csv
Reading ../data/train.json
INFO:tensorflow:Step 0 | Loss: 0.7939 | Spent: 26.1 secs | LR: 0.000010
INFO:tensorflow:Step 100 | Loss: 0.6701 | Spent: 49.2 secs | LR: 0.000010
INFO:tensorflow:Step 200 | Loss: 0.4445 | Spent: 48.4 secs | LR: 0.000011
INFO:tensorflow:Step 300 | Loss: 0.4934 | Spent: 48.3 secs | LR: 0.000011
INFO:tensorflow:Step 400 | Loss: 0.7102 | Spent: 48.4 secs | LR: 0.000011
INFO:tensorflow:Step 500 | Loss: 0.8648 | Spent: 47.0 secs | LR: 0.000012
INFO:tensorflow:Step 600 | Loss: 0.5089 | Spent: 47.0 secs | LR: 0.000012
INFO:tensorflow:Step 700 | Loss: 0.4457 | Spent: 48.1 secs | LR: 0.000013
INFO:tensorflow:Step 800 | Loss: 0.4617 | Spent: 49.0 secs | LR: 0.000013
INFO:tensorflow:Step 900 | Loss: 0.3721 | Spent: 49.1 secs | LR: 0.000013
INFO:tensorflow:Step 1000 | Loss: 0.4169 | Spent: 50.7 secs | LR: 0.000014
INFO:tensorflow:Step 1100 | Loss: 0.5750 | Spent: 48.6 secs | LR: 0.000014
INFO:tensorflow:Step 1200 | Loss: 0.6129 | Spent: 48.4 secs