In [1]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab VM, then switch the working directory to
# the project's `main` folder: later cells open '../vocab/...' and '../data/...'
# relative to it.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_classification/imdb/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%tensorflow_version 2.x
!pip install tensorflow-addons



In [3]:
import tensorflow as tf
import numpy as np
import pprint
import logging
import time

from tensorflow_addons.optimizers.cyclical_learning_rate import Triangular2CyclicalLearningRate

print("TensorFlow Version", tf.__version__)
# tf.test.is_gpu_available() is deprecated; TensorFlow's own warning recommends
# tf.config.list_physical_devices('GPU'), which returns a (possibly empty) list.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.3.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [4]:
def data_generator(f_paths, params):
  """Yield one `(words, chars, y)` example per non-empty line of each file.

  Every non-empty line is expected to have the form `label<TAB>text`, where
  the text is already tokenised with single spaces.

  Args:
    f_paths: iterable of file paths, read in the given order.
    params: dict providing 'word2idx', 'char2idx', 'max_word_len' and
      'max_char_len'.

  Yields:
    words: list of word ids, at most `max_word_len` long.
    chars: one entry per kept word; each entry is a list of exactly
      `max_char_len` character ids (truncated, or right-padded with 0).
    y: the label converted with `int()`.

  Raises:
    ValueError: if a non-empty line has no tab separator or its label is not
      an integer.
  """
  max_word_len = params['max_word_len']
  max_char_len = params['max_char_len']
  word2idx = params['word2idx']
  char2idx = params['char2idx']

  for f_path in f_paths:
    with open(f_path) as f:
      print('Reading', f_path)
      for line in f:
        line = line.rstrip()
        if not line:
          # A blank line (e.g. an extra newline at the end of a file) holds no example.
          continue
        # Split on the first tab only, so a stray tab inside the text stays text.
        label, text = line.split('\t', 1)
        # Truncate once, up front: no ids are built for tokens that would be
        # discarded by the length limit anyway.
        tokens = text.split(' ')[:max_word_len]

        words = [get_idx(word2idx, w) for w in tokens]

        chars = []
        for w in tokens:
          char_ids = [get_idx(char2idx, c) for c in w[:max_char_len]]
          # Right-pad short words with 0 so every row has the same width.
          char_ids += [0] * (max_char_len - len(char_ids))
          chars.append(char_ids)

        yield words, chars, int(label)


def dataset(is_training, params):
  """Build the batched `tf.data.Dataset` used for training or evaluation.

  Args:
    is_training: when truthy, read `params['train_paths']` and shuffle with a
      buffer of `params['num_samples']`; otherwise read `params['test_paths']`
      in file order.
    params: dict that also provides 'max_char_len' and 'batch_size', plus
      whatever `data_generator` reads.

  Returns:
    A prefetched dataset of `(words, chars, labels)` batches in which word and
    char ids are padded with 0 to the longest example of each batch.
  """
  paths_key = 'train_paths' if is_training else 'test_paths'
  elem_shapes = ([None], [None, params['max_char_len']], ())
  elem_types = (tf.int32, tf.int32, tf.int32)
  # The label component is a scalar; -1 is the padding value supplied for it.
  pad_values = (0, 0, -1)

  # The path list is looked up inside the lambda, i.e. each time the dataset
  # is iterated rather than when it is constructed.
  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[paths_key], params),
    output_types=elem_types,
    output_shapes=elem_shapes)
  if is_training:
    ds = ds.shuffle(params['num_samples'])
  ds = ds.padded_batch(params['batch_size'], elem_shapes, pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [5]:
class KernelAttentivePooling(tf.keras.Model):
  """Masked attention pooling: collapses a sequence into one weighted-sum vector."""

  def __init__(self, params):
    """Create the dropout layer and the bias-free, tanh-activated scoring layer.

    Args:
      params: dict; only 'dropout_rate' is read here.
    """
    super().__init__()
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.kernel = tf.keras.layers.Dense(1,
                                        activation=tf.tanh,
                                        use_bias=False)


  def call(self, inputs, training=False):
    """Pool `x` over its sequence axis using softmax attention weights.

    Args:
      inputs: pair `(x, masks)`. As called from `Model.call`, `x` is
        (batch, words, features) and `masks` is (batch, words) with 0 at
        padded positions.
      training: forwarded to the dropout layer.

    Returns:
      Tensor with the sequence axis summed out, i.e. (batch, features).
    """
    x, masks = inputs
    # One unnormalised score per position.
    scores = self.kernel(self.dropout(x, training=training))
    scores = tf.squeeze(scores, -1)
    # Padded positions get -inf so softmax assigns them zero weight.
    neg_inf = tf.fill(tf.shape(scores), float('-inf'))
    scores = tf.where(tf.equal(masks, 0), neg_inf, scores)
    weights = tf.expand_dims(tf.nn.softmax(scores), -1)
    # x^T @ weights is the attention-weighted sum over positions.
    pooled = tf.matmul(x, weights, transpose_a=True)
    return tf.squeeze(pooled, -1)

In [6]:
class Model(tf.keras.Model):
  """Text classifier combining frozen word vectors with a character CNN.

  Per word, a pretrained embedding is concatenated with a max-pooled
  character-CNN feature; the sequence then goes through a dense layer, a word
  level CNN and attentive pooling before the 2-way output layer.
  """

  def __init__(self, params):
    """Create the layers and load the pretrained word embedding matrix.

    Args:
      params: dict providing 'char2idx', 'char_embed_size', 'max_char_len',
        'cnn_filters', 'cnn_kernel_size' and 'dropout_rate'.
    """
    super().__init__()

    # Keep the sizes `call` needs on the instance, so the forward pass uses the
    # configuration this model was built with instead of a module-level
    # `params` variable.
    self._max_char_len = params['max_char_len']
    self._char_embed_size = params['char_embed_size']
    self._cnn_filters = params['cnn_filters']

    # +1 row because get_idx maps unknown symbols to len(char2idx).
    self.char_embedding = tf.keras.layers.Embedding(len(params['char2idx'])+1, params['char_embed_size'])
    self.word_embedding = tf.Variable(np.load('../vocab/word.npy'),
                                      dtype=tf.float32,
                                      name='pretrained_embedding',
                                      trainable=False,)

    self.char_cnn = tf.keras.layers.Conv1D(filters=params['cnn_filters'], kernel_size=params['cnn_kernel_size'], activation=tf.nn.elu, padding='same')
    self.embed_drop = tf.keras.layers.Dropout(params['dropout_rate'])
    self.embed_fc = tf.keras.layers.Dense(params['cnn_filters'], tf.nn.elu, name='embed_fc')

    self.word_cnn = tf.keras.layers.Conv1D(filters=params['cnn_filters'], kernel_size=params['cnn_kernel_size'], activation=tf.nn.elu, padding='same')
    self.word_drop = tf.keras.layers.Dropout(params['dropout_rate'])

    self.attentive_pooling = KernelAttentivePooling(params)
    self.out_linear = tf.keras.layers.Dense(2)


  def call(self, inputs, training=False):
    """Compute class logits for a batch.

    Args:
      inputs: pair `(words, chars)`; `words` holds word ids of shape
        (batch, words) and `chars` holds character ids of shape
        (batch, words, max_char_len). Id 0 is treated as padding.
      training: enables dropout when True.

    Returns:
      Logits of shape (batch, 2).
    """
    words, chars = inputs
    if words.dtype != tf.int32:
      words = tf.cast(words, tf.int32)

    # sign() of the ids is 0 at padded positions and 1 at positive ids.
    masks = tf.sign(words)
    batch_sz = tf.shape(words)[0]
    word_len = tf.shape(words)[1]

    # Character branch: fold the word axis into the batch axis, convolve over
    # characters, max-pool over characters, then unfold again.
    chars = self.char_embedding(chars)
    chars = tf.reshape(chars, (batch_sz*word_len, self._max_char_len, self._char_embed_size))
    chars = self.char_cnn(chars)
    chars = tf.reduce_max(chars, 1)
    chars = tf.reshape(chars, (batch_sz, word_len, self._cnn_filters))

    words = tf.nn.embedding_lookup(self.word_embedding, words)

    x = tf.concat((words, chars), axis=-1)
    x = self.embed_drop(x, training=training)
    x = self.embed_fc(x)

    x = self.word_drop(x, training=training)
    x = self.word_cnn(x)

    x = self.attentive_pooling((x, masks), training=training)
    x = self.out_linear(x)

    return x

In [7]:
# Hyper-parameters and file locations shared by the data pipeline, the model
# and the training loop. Paths are relative to the working directory.
params = {
  'vocab_path': '../vocab/word.txt',    # not read by the code shown in this notebook
  'train_paths': [
    '../data/train_bt_part1.txt', '../data/train_bt_part2.txt',
    '../data/train_bt_part3.txt', '../data/train_bt_part4.txt',
    '../data/train_bt_part5.txt', '../data/train_bt_part6.txt',
  ],
  'test_paths': ['../data/test.txt'],
  'num_samples': 25000 * 2,             # shuffle-buffer size; also scales the LR schedule step size
  'num_labels': 2,                      # not read by the code shown in this notebook
  'batch_size': 32,
  'max_word_len': 1000,                 # examples are truncated to this many words
  'max_char_len': 10,                   # every word is padded/truncated to this many characters
  'char_embed_size': 100,
  'cnn_filters': 300,                   # used by both the char CNN and the word CNN
  'cnn_kernel_size': 5,
  'dropout_rate': 0.2,
  'kernel_size': 5,                     # not read by the code shown in this notebook
  'num_patience': 6,                    # evaluations without improvement before stopping
  'init_lr': 1e-4,
  'max_lr': 8e-4,
}

In [8]:
def get_vocab(f_path):
  """Map each right-stripped line of `f_path` to its 0-based line number.

  If the same stripped line occurs more than once, the last occurrence wins.
  """
  with open(f_path) as vocab_file:
    return {entry.rstrip(): line_no for line_no, entry in enumerate(vocab_file)}


def get_idx(symbol2idx, symbol):
  """Return the id stored for `symbol`; unknown symbols map to `len(symbol2idx)`."""
  if symbol in symbol2idx:
    return symbol2idx[symbol]
  # One past the largest index when ids are 0..len-1: the shared "unknown" id.
  return len(symbol2idx)

In [9]:
# Load the symbol -> index tables that data_generator and Model read from `params`.
params['char2idx'] = get_vocab('../vocab/char.txt')
params['word2idx'] = get_vocab('../vocab/word.txt')


# Create the model's variables up front (word ids: [batch, words];
# char ids: [batch, words, max_char_len]) so they can be listed below.
model = Model(params)
model.build(input_shape=[[None, None], [None, None, params['max_char_len']]])
pprint.pprint([(v.name, v.shape) for v in model.trainable_variables])

# Value handed to the cyclical schedule as `step_size`:
# 4 * num_samples / batch_size optimisation steps (integer division).
step_size = 4 * params['num_samples'] // params['batch_size']
decay_lr = Triangular2CyclicalLearningRate(
  initial_learning_rate = params['init_lr'],
  maximal_learning_rate = params['max_lr'],
  step_size = step_size,)
# The optimizer starts at init_lr; its lr is overwritten from `decay_lr` on every step below.
optim = tf.optimizers.Adam(params['init_lr'])
global_step = 0

# Early-stopping state: consecutive evaluations without improvement, and the best accuracy so far.
count = 0
best_acc = .0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)


# Each iteration of this loop is one full pass over the training files followed
# by one evaluation; the loop ends only through the patience check at the bottom.
while True:
  # TRAINING
  for words, chars, labels in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      logits = model((words, chars), training=True)
      # Softmax cross-entropy on one-hot labels with label smoothing of 0.2.
      loss = tf.reduce_mean(tf.losses.categorical_crossentropy(y_true = tf.one_hot(labels, 2),
                                                               y_pred = logits,
                                                               from_logits = True,
                                                               label_smoothing = .2,))
    
    # Set the learning rate for this step from the schedule before applying gradients.
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    optim.apply_gradients(zip(grads, model.trainable_variables))

    if global_step % 50 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      # Reset the timer so "Spent" reports the time since the previous log line.
      t0 = time.time()
    global_step += 1
  
  # EVALUATION
  # A fresh metric per evaluation, so accuracy is not accumulated across passes.
  m = tf.keras.metrics.Accuracy()

  for words, chars, labels in dataset(is_training=False, params=params):
    logits = model((words, chars), training=False)
    y_pred = tf.argmax(logits, axis=-1)
    m.update_state(y_true=labels, y_pred=y_pred)
    
  acc = m.result().numpy()
  logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))
  
  # NOTE(review): the files in params['test_paths'] drive both the reported
  # accuracy and the stopping decision; no separate validation split is used here.
  if acc > best_acc:
    best_acc = acc
    # you can save model here
    count = 0
  else:
    count += 1
  logger.info("Best Accuracy: {:.3f}".format(best_acc))

  # Stop after `num_patience` consecutive evaluations without a new best accuracy.
  if count == params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break

[('embedding/embeddings:0', TensorShape([80, 100])),
 ('conv1d/kernel:0', TensorShape([5, 100, 300])),
 ('conv1d/bias:0', TensorShape([300])),
 ('embed_fc/kernel:0', TensorShape([600, 300])),
 ('embed_fc/bias:0', TensorShape([300])),
 ('conv1d_1/kernel:0', TensorShape([5, 300, 300])),
 ('conv1d_1/bias:0', TensorShape([300])),
 ('kernel_attentive_pooling/dense/kernel:0', TensorShape([300, 1])),
 ('dense_1/kernel:0', TensorShape([300, 2])),
 ('dense_1/bias:0', TensorShape([2]))]
Reading ../data/train_bt_part1.txt
Reading ../data/train_bt_part2.txt
Reading ../data/train_bt_part3.txt
Reading ../data/train_bt_part4.txt
Reading ../data/train_bt_part5.txt
Reading ../data/train_bt_part6.txt
INFO:tensorflow:Step 0 | Loss: 0.6917 | Spent: 50.2 secs | LR: 0.000100
INFO:tensorflow:Step 50 | Loss: 0.6308 | Spent: 26.0 secs | LR: 0.000106
INFO:tensorflow:Step 100 | Loss: 0.6034 | Spent: 23.7 secs | LR: 0.000111
INFO:tensorflow:Step 150 | Loss: 0.5495 | Spent: 22.9 secs | LR: 0.000117
INFO:tensorflow