Various text classification experiments.



Imports and set-up:

In [1]:
%tensorflow_version 2.x
import numpy as np
import tensorflow as tf
import pandas as pd
import subprocess
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
import gensim
import re
import copy
import keras.backend as k
import sys
import time
import datetime, os
import keras

# TODO: actually implement distribution in the training loop
strategy = tf.distribute.get_strategy()

use_mixed_precision=False
tf.config.run_functions_eagerly(False)

is_tpu=None
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  is_tpu = True
except ValueError:
  is_tpu = False

if is_tpu:
  print('TPU available.')
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)
  if use_mixed_precision:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)
else:
  print('No TPU available.')
  result = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE).stdout.decode("utf-8").strip()
  if "has failed" in result:
    print("No GPU available.")
  else:
    print(result)
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(tf.distribute.experimental.CollectiveCommunication.NCCL)
    if use_mixed_precision:
      policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
      tf.keras.mixed_precision.experimental.set_policy(policy)

No TPU available.
GPU 0: Tesla T4 (UUID: GPU-3734368d-8319-7999-26df-be3c30889be0)
Instructions for updating:
use distribute.MultiWorkerMirroredStrategy instead
INFO:tensorflow:Using MirroredStrategy with devices ('/device:GPU:0',)
INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:GPU:0',), communication = CommunicationImplementation.NCCL


Downloading the data

In [2]:
# Download the Sentiment140 dataset
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip


Loading and splitting the data

In [3]:
sen140 = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv", encoding='latin-1',
    names=["target", "ids", "date", "flag", "user", "text"])
sen140.head()
sen140 = sen140.sample(frac=1).reset_index(drop=True)
sen140 = sen140[['text', 'target']]
features, targets = sen140.iloc[:,0].values, sen140.iloc[:,1].values

print("A random tweet\t:", features[0])

# split between train and test sets
x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.33)
# Scale classes to the [0, 1] range
y_train = y_train.astype("float32")/4.0
y_test = y_test.astype("float32")/4.0
# Shape
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

A random tweet	: Good Morning. so gutting back to college tomorrow 


Preprocessing data

In [4]:
# Standardizing and splitting the strings.

def process_tweet(x):
  x = x.strip()
  x = x.lower()
  x = re.sub(r"[^a-zA-Z0-9üöäÜÖÄß\.,!\?\-%\$€\/ ]+'", ' ', x) # :(
  x = re.sub('([\.,!\?\-%\$€\/])',r' \1 ', x)
  x = re.sub('\s{2,}', ' ', x)
  x = x.split()
  x.append("[&END&]")
  length = len(x)
  return x

tweets_train = []
tweets_test = []
for tweet in x_train:
  tweets_train.append(process_tweet(tweet[0]))
for tweet in x_test:
  tweets_test.append(process_tweet(tweet[0]))

In [5]:
# Building the initial vocab with all words from the training set
def add_or_update_word(_vocab, word):
  entry = None
  if word in _vocab:
    entry = _vocab[word]
    entry = (entry[0], entry[1]+1)
  else:
    entry = (len(_vocab), 1)
  _vocab[word] = entry

vocab_pre = {}
# "[&END&]" is for padding, "[&UNK&]" for unknown words
add_or_update_word(vocab_pre, "[&END&]")
add_or_update_word(vocab_pre, "[&UNK&]")
for tweet in tweets_train:
  for word in tweet:
    add_or_update_word(vocab_pre, word)


# limiting the vocabulary to only include words that appear at least 3 times
# in the training data set. Reduces vocab size to about 1/6th.
# This is to make it harder for the model to overfit by focusing on words that
# may only appear in the training data, and also to generally make it learn to 
# handle unknown words (more robust)
keys = vocab_pre.keys()
vocab = {}
vocab["[&END&]"] = 0
vocab["[&UNK&]"] = 1
for key in keys:
  freq = vocab_pre[key][1]
  index = vocab_pre[key][0]
  if freq >= 3 and index>1:
    vocab[key] = len(vocab)

# Replace words that have been removed from the vocabulary with "[&UNK&]" in
# both the training and testing data
def filter_unknown(_in, _vocab):
  for tweet in _in:
    for i in range(len(tweet)):
      if not tweet[i] in _vocab:
        tweet[i] = "[&UNK&]"

filter_unknown(tweets_train, vocab)
filter_unknown(tweets_test, vocab)

Using gensim word2vec to get a good word embedding.

In [6]:
# train the embedding. TODO: Save the result for later use, this takes some time
embedding_dims=128
embedding = gensim.models.Word2Vec(tweets_train, size=embedding_dims, min_count=0)

In [7]:
# convert the training and test data to their tokenized form based on the
# word indices that gensim's word2vec decided on.
def tokenize(_in, _vocab):
  _out = []
  for i in range(len(_in)):
    tweet = _in[i]
    wordlist = []
    for word in tweet:
      wordlist.append(_vocab[word].index)
    _out.append(wordlist)
  return _out

tokens_train = tokenize(tweets_train, embedding.wv.vocab)
tokens_test = tokenize(tweets_test, embedding.wv.vocab)

Creating modules and defining the model.

In [8]:
class SequenceCollapseAttention(tf.Module):
  '''
  Collapses a sequence of arbitrary length into num_out_entries entries from the
  sequence according to dot-product attention. A variable length sequence is
  thus collapsed to a fixed length.
  '''
  def __init__(self, num_out_entries, initializer=tf.keras.initializers.HeNormal, name=None):
      super().__init__(name=name)
      self.is_built = False
      self.num_out_entries = num_out_entries
      self.initializer = initializer()

  def __call__(self, keys, query):
      if not self.is_built:
          self.weights = tf.Variable(
              self.initializer([query.shape[-1], self.num_out_entries]), trainable=True)
          self.biases = tf.Variable(tf.zeros([self.num_out_entries]), trainable=True)
          self.is_built = True

      scores = tf.matmul(query, self.weights) + self.biases
      scores = tf.transpose(scores, perm=(0,2,1))
      scores = tf.nn.softmax(scores)
      output = tf.linalg.matmul(scores, keys)
      return output

class WordEmbedding(tf.Module):
  '''
  Creates a word-embedding module from a provided embedding matrix.
  '''
  def __init__(self, embedding_matrix, trainable=False, name=None):
      super().__init__(name=name)
      self.embedding = tf.Variable(embedding_matrix, trainable=trainable)

  # @tf.function #(experimental_relax_shapes=True)
  def __call__(self, x):
      return tf.nn.embedding_lookup(self.embedding, x)

class PositionalEncoding1D(tf.Module):
  '''
  Positional encoding as in the Attention Is All You Need paper. I hope.
  '''
  def __init__(self, axis=-2, base=100, name=None):
      super().__init__(name=name)
      self.axis=axis
      self.base=base

  @tf.function
  def __call__(self, x):
    sequence_length = tf.shape(x)[self.axis]
    d = tf.shape(x)[-1]
    T = tf.shape(x)[self.axis]
    pos_enc = tf.range(0, d/2, delta=1, dtype=tf.float32)
    pos_enc = (-2.0/tf.cast(d, dtype=tf.float32))*pos_enc
    base = tf.cast(tf.fill(tf.shape(pos_enc), self.base), dtype=tf.float32)
    pos_enc = tf.math.pow(base, pos_enc)
    pos_enc = tf.expand_dims(pos_enc, axis=0)
    pos_enc = tf.tile(pos_enc, [T,1])
    t = tf.expand_dims(tf.range(1, T+1, delta=1, dtype=tf.float32), axis=-1)
    pos_enc = tf.math.multiply(pos_enc, t)
    pos_enc_sin = tf.expand_dims(tf.math.sin(pos_enc), axis=-1)
    pos_enc_cos = tf.expand_dims(tf.math.cos(pos_enc), axis=-1)
    pos_enc = tf.concat((pos_enc_sin, pos_enc_cos), axis=-1)
    pos_enc = tf.reshape(pos_enc, [T,d])
    return x+pos_enc

class MLP_Block(tf.Module):
  '''
  With batch normalization before the activations.
  A regular old multilayer perceptron, hidden shapes are defined by the "shapes"
  argument.
  '''
  def __init__(self, shapes, initializer=tf.keras.initializers.HeNormal, name=None, activation=tf.nn.swish):
      super().__init__(name=name)
      self.is_built = False
      self.shapes = shapes
      self.initializer = initializer()
      self.weights = [None] * len(shapes)
      self.biases = [None] * len(shapes)
      self.batch_norms = [None] * len(shapes)
      self.activation = activation

  def _build(self, x):
      for n in range(0, len(self.shapes)):
          in_shape = x.shape[-1] if n == 0 else self.shapes[n - 1]
          factor = 1 if self.activation != tf.nn.crelu or n == 0 else 2
          self.weights[n] = tf.Variable(
              self.initializer([in_shape * factor, self.shapes[n]]), trainable=True)
          self.biases[n] = tf.Variable(tf.zeros([self.shapes[n]]), trainable=True)
          self.batch_norms[n] = layers.BatchNormalization(trainable=True)
      self.is_built = True

  def __call__(self, x):
      if not self.is_built:
          self._build(x)

      h = x
      for n in range(len(self.shapes)):
          h = tf.matmul(h, self.weights[n]) + self.biases[n]
          h = self.batch_norms[n](h)
          h = self.activation(h)

      return h

class SyntheticGradient(tf.Module):
    '''
    An implementation of synthetic gradients. When added to a model, this
    module will intercept incoming gradients and replace them by learned,
    synthetic ones.

    Depending on the dimensionality and magnitude of incoming gradients, the 
    chosen initializer, or activations, the gradients provided by the generator 
    might be too large in the beginning and lead to NANs.

    This can be mitigated by using a uniform initializer for the generator
    (default), training the generator for a number of epochs before
    generating the first gradient (default 16), using a bounded activation for
    the hidden layers of the generator (default tanh), changing the learning
    rate of the generator, but most directly and effectively by setting an
    output_scale for the generated gradient (default 1.0).

    When the model using this module does not learn, the generator might be too
    simple, the output_scale might be too low, the learning rate of the
    generator might be too large or too low, or there may be a bug of which i
    am not yet aware.

    The relative_generator_hidden_shapes list defines the shapes of the hidden
    layers of the generator as a multiple of its input dimension. For an affine
    transormation, pass an empty list.
    '''
    def __init__(self,
                 initializer=tf.keras.initializers.GlorotUniform,
                 activation=tf.nn.tanh,
                 relative_generator_hidden_shapes=[3,],
                 learning_rate=0.01,
                 first_batch_epochs=16,
                 sg_output_scale=1.0,
                 name=None):
      super().__init__(name=name)
      self.is_built = False
      self.initializer = initializer
      self.activation = activation
      self.relative_generator_hidden_shapes = relative_generator_hidden_shapes
      self.first_batch_epochs = first_batch_epochs
      self.sg_output_scale = sg_output_scale
      self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    def _build(self, xy, dy):
      '''
      Builds the gradient generator on its first run, and trains on the first
      incoming batch of gradeints for a number of epochs to avoid bad results
      (including NANs) in the first few batches where the generator still
      outputs bad approximations.
      '''
      if len(self.relative_generator_hidden_shapes)>0:
        generator_shape = [xy.shape[-1]*mult for mult in self.relative_generator_hidden_shapes]
        self._generator_hidden = MLP_Block(
            generator_shape,
            activation=self.activation,
            initializer=self.initializer)
      else:
        self._generator_hidden = tf.identity
        
      self._generator_out = MLP_Block(
          [dy.shape[-1]],
          activation=tf.identity,
          initializer=self.initializer)
      
      # train for a number of epochs on the first run, by default 16, to avoid
      # bad results in the beginning of training.
      for i in range(self.first_batch_epochs):
        self._model_grad(xy, dy)
      self.is_built = True

    @tf.function
    def _grad(self, x):
      '''
      Just an MLP, or just an affine transformation (as suggested in that google
      paper) if the hidden shape in the constructor is set to be empty. The
      divisions by size are an attempt to avoid NANs caused by gradients that 
      are too large. Still, especially when using no hidden layers here or when
      the input dimension and/or magnitude is large, the output scale needs to
      be set to some low value by trial and error until NANs no longer occur.
      '''
      x = tf.stop_gradient(x)
      x = self._generator_hidden(x)
      x = x/tf.cast(tf.shape(x)[-1], dtype=tf.float32)
      out = self._generator_out(x)
      out = tf.squeeze(out)
      out = out/tf.cast(tf.shape(out)[-1], dtype=tf.float32)
      return out*self.sg_output_scale

    @tf.function
    def _model_grad(self, x, _target):
      '''
      Gradient descend for the gradient generator. This is called every time a
      gradient comes in, although in theory (especially with deeper gradient
      generators) once the gradients are modeled sufficiently, it could be OK to
      stop training on incoming gradients, thus fully decoupling the lower parts 
      of the network from the upper parts relative to this SG module.
      '''
      with tf.GradientTape() as tape:
          _sg = self._grad(x)
          l2_loss = _target - _sg
          l2_loss = tf.math.reduce_sum(tf.math.square(l2_loss), axis=-1)
          #l2_loss = tf.math.sqrt(l2_dist)
          grads = tape.gradient(l2_loss, self.trainable_variables)
          self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    @tf.custom_gradient
    def sg(self, x, y):
      '''
      In the forward pass it is essentially a no-op (identity). In the backwards
      pass it replaces the incoming gradient by a synthetic one.
      '''
      x = tf.identity(x)
      def grad(dy):
        # concat x and the label to be inputs for the generator:
        xy = self.concat_x_and_y(x,y)

        if not self.is_built:
            self._build(xy, dy)
        # train the generator on the incoming gradient:
        self._model_grad(xy, dy)
        
        # return the gradient. The second return value is the gradient for y,
        # which should be zero since we only need y (labels) to generate the 
        # synthetic gradients
        dy = self._grad(xy)
        return dy, tf.zeros(tf.shape(y))
      return x, grad

    def __call__(self, x, y):
      return self.sg(x, y)


    @tf.function
    def concat_x_and_y(self, x, y):
      '''
      Probably an overly complex yet incomplete solution to a rather small
      inconvenience.
      Inconvenience: The gradient generators take the output of the last module
      AND the target/labels of the network as inputs. But those two tensors can
      be of different shapes. The obvious solution would be to manually reshape
      the targets so they can be concatenated with the outputs of the past
      state. But because i wanted this SG module to be as "plug-and-play" as
      possible, i tried to attempt automatic reshaping.

      Should work for 1d->1d, and 1d-sequence -> 1d, possibly 1d seq->seq,
      unsure about the rest.
      '''
      # insert as many dims before the last dim of y to give it the same rank
      # as x
      amount = tf.math.maximum(tf.rank(x)-tf.rank(y), 0)
      new_shape = tf.concat((tf.shape(y)[:-1],
                             tf.tile([1], [amount]),
                             [tf.shape(y)[-1]]), axis=-1)
      y = tf.reshape(y, new_shape)

      # tile the added dims such that x and y can be concatenated
      # In order to tile only the added dims, i need to set the dimensions with
      # a length of 1 (except the last) to the length of the corresponding
      # dimensions in x, while setting the rest to 1. This is waiting to break.
      mask = tf.cast( tf.math.less_equal(tf.shape(y),
                                         tf.constant([1])), dtype=tf.int32)
      #ignore the last dim
      mask = tf.concat([mask[:-1],tf.constant([0])],axis=-1)

      zeros_to_ones = tf.math.subtract(tf.ones(tf.shape(mask), dtype=tf.int32), mask)
      # has ones where there is a one in the shape, now the 1s are set to the
      # length in x
      mask = tf.math.multiply(mask,tf.shape(x))
      # add ones to all other dimensions to preserve their shape
      mask = tf.math.add(zeros_to_ones, mask)
      # tile
      y = tf.tile(y, mask)
      return tf.concat((x, y), axis=-1)

class FlattenL2D(tf.Module):
    "Flattens the last two dimensions only"
    def __init__(self, name=None):
        super().__init__(name=name)

    def __call__(self, x):
        new_shape = tf.concat(
            (tf.shape(x)[:-2], [(tf.shape(x)[-1]) * (tf.shape(x)[-2])]),
            axis=-1)
        return tf.reshape(x, new_shape)


initializer=tf.keras.initializers.HeNormal


class SentimentAnalysisWithAttention(tf.Module):
    def __init__(self, name=None):
        super().__init__(name=name)

        # Structure and the idea behind it:
        # 1: The input sequence is embedded and gets positional encoding.
        # 2.1: An MLP block ('query') computes scores for the following
        #      attention layer for each entry in the sequence. Ie, it decides
        #      which words are worth a closer look.
        # 2.2: And attention layer selects n positionally encoded word
        #      embeddings from the input sequence based on the learned queries.
        # 3: The result is flattened into a tensor of known shape and a number
        #    of dense layers compute the final classification.

        self.embedding = WordEmbedding(embedding.wv.vectors)
        self.batch_norm = layers.BatchNormalization()
        self.pos_enc = PositionalEncoding1D()
        self.query = MLP_Block([256, 128], initializer=initializer)
        self.attention = SequenceCollapseAttention(num_out_entries=8,
                                                   initializer=initializer)
        self.flatten = FlattenL2D()
        self.dense = MLP_Block([512, 256, 128, 64], initializer=initializer)
        self.denseout = MLP_Block([1],
                                  initializer=initializer,
                                  activation=tf.nn.sigmoid)
        

        # Synthetic gradient modules for the various layers.
        self.sg_query = SyntheticGradient(first_batch_epochs=64,
                                          sg_output_scale=0.1,
                                          relative_generator_hidden_shapes=[9,3])
        self.sg_attention = SyntheticGradient()
        self.sg_dense = SyntheticGradient()

    def __call__(self, x, y=tf.constant([]), training=True):
        x = self.embedding(x)
        x = self.pos_enc(x)
        x = self.batch_norm(x)
        q = self.query(x)
        # q = self.sg_query(q, y)     #SG, commented out because it's slowing things down
        x = self.attention(x, q)
        x = self.flatten(x)
        x = self.sg_attention(x, y)   #SG
        x = self.dense(x)
        x = self.sg_dense(x, y)       #SG
        output = self.denseout(x)
        return output

model = SentimentAnalysisWithAttention()

In [9]:
class BatchGenerator(keras.utils.Sequence):
  '''
  This is a relic from the early days of this notepad, solving a problem i 
  no longer face, and i should probably remove this.
  Creates batches from the given data, specifically it pads the sequences
  per batch only as much as necessary to make every sequence withing a batch be
  of the same length.
  '''
  def __init__(self, inputs, labels, padding, batch_size):
      self.batch_size = batch_size
      self.labels = labels
      self.inputs = inputs
      self.padding = padding
      #self.on_epoch_end()

  def __len__(self):
    return int(np.floor(len(self.inputs) / self.batch_size))

  def __getitem__(self, index):
    max_length = 0
    start_index = index*self.batch_size
    end_index = start_index+self.batch_size
    for i in range(start_index, end_index):
      l = len(self.inputs[i])
      if l>max_length:
        max_length = l
    
    out_x = np.empty([self.batch_size, max_length], dtype='int32')
    out_y = np.empty([self.batch_size, 1], dtype='float32')
    for i in range(self.batch_size):
      out_y[i] = self.labels[start_index+i]
      tweet = self.inputs[start_index+i]
      l = len(tweet)
      l = min(l,max_length)
      for j in range(0, l):
        out_x[i][j] = tweet[j]
      for j in range(l, max_length):
        out_x[i][j] = self.padding
    return out_x, out_y

    #def on_epoch_end(self):


Training the model

In [10]:
@tf.function(experimental_relax_shapes=True)
def training_step(_model, _loss, metrics, _optimizer, _x_batch, _y_batch):
  with tf.GradientTape() as tape:
    predictions = _model(_x_batch, _y_batch)
    losses = _loss(_y_batch, predictions)

    for metric in metrics:
      metric.update_state(_y_batch, predictions)

    grads = tape.gradient(losses, _model.trainable_variables)
    _optimizer.apply_gradients(zip(grads, _model.trainable_variables))
  
def fit(_epochs, _model, _loss, _metrics, _optimizer, _training_generator):
  batch_time = -1
  for epoch in range(_epochs):
    out_length = 0
    start_e = time.time()
    start_p = time.time()
    num_batches = len(_training_generator)
    for b in range(num_batches):
        start_b = time.time()

        x_batch, y_batch = _training_generator[b]
        training_step(_model, _loss, _metrics, _optimizer, x_batch, y_batch)

        # progress output
        elapsed_t = time.time()-start_b
        if batch_time != -1:
            batch_time = 0.05*elapsed_t + 0.95*batch_time
        else:
            batch_time = elapsed_t
        if int(time.time()-start_p) >= 1 or b==(num_batches-1):
            start_p = time.time()
            eta = int((num_batches-b)*batch_time)
            ela = int(time.time()-start_e)
            if out_length != 0:
                sys.stdout.write("\b"*(out_length))
            out_string = "\nEpoch %d/%d,\tbatch %d/%d,\telapsed: %d/%ds " % (
                (epoch+1), _epochs, b+1, num_batches, ela, ela+eta)
            for metric in _metrics:
                out_string += "\t %s: %f" % (metric.name, float(metric.result()))
            out_length = len(out_string)
            sys.stdout.write(out_string)
    for metric in _metrics:
        metric.reset_states()
    sys.stdout.write("\n")

sgd=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = sgd
metrics = (tf.keras.metrics.BinaryCrossentropy(from_logits=True), tf.keras.metrics.BinaryAccuracy())

batch_size = 512
epochs = 4
padding = embedding.wv.vocab["[&END&]"].index

training_generator = BatchGenerator(tokens_train, y_train, padding, batch_size=batch_size)
fit(epochs, model, loss, metrics, optimizer, training_generator)



Epoch 1/4,	batch 4/2093,	elapsed: 5/7286s 	 binary_crossentropy: 0.750028	 binary_accuracy: 0.512207
Epoch 1/4,	batch 51/2093,	elapsed: 6/682s 	 binary_crossentropy: 0.702559	 binary_accuracy: 0.502413
Epoch 1/4,	batch 99/2093,	elapsed: 7/101s 	 binary_crossentropy: 0.694660	 binary_accuracy: 0.499980
Epoch 1/4,	batch 148/2093,	elapsed: 8/51s 	 binary_crossentropy: 0.684925	 binary_accuracy: 0.530485
Epoch 1/4,	batch 198/2093,	elapsed: 9/47s 	 binary_crossentropy: 0.674385	 binary_accurac

Testing it on test data

In [11]:
testing_generator = BatchGenerator(tokens_test, y_test, padding, batch_size=batch_size)

for metric in metrics:
    metric.reset_states()

@tf.function(experimental_relax_shapes=True)
def validation_step(_model, metrics, _x_batch, _y_batch):
  predictions = _model(_x_batch, _y_batch, training=False)
  for metric in metrics:
    metric.update_state(_y_batch, predictions)

def validate(_model, _metrics, _testing_generator):
  batch_time = -1
  out_length = 0
  start_e = time.time()
  start_p = time.time()
  num_batches = len(_testing_generator)
  for b in range(num_batches):
      start_b = time.time()

      x_batch, y_batch = _testing_generator[b]
      validation_step(_model, _metrics, x_batch, y_batch)

      # progress output
      elapsed_t = time.time()-start_b
      if batch_time != -1:
          batch_time = 0.05*elapsed_t + 0.95*batch_time
      else:
          batch_time = elapsed_t
      if int(time.time()-start_p) >= 1 or b==(num_batches-1):
          start_p = time.time()
          eta = int((num_batches-b)*batch_time)
          ela = int(time.time()-start_e)
          if out_length != 0:
              sys.stdout.write("\b"*(out_length+1))
          out_string = "Batch %d/%d,\telapsed: %d/%ds " % (
              b+1, num_batches, ela, ela+eta)
          for metric in _metrics:
              out_string += "\t %s: %f" % (metric.name, float(metric.result()))
          out_length = len(out_string)
          sys.stdout.write(out_string)
  for metric in _metrics:
      metric.reset_states()

validate(model, metrics, testing_generator)

Batch 23/1031,	elapsed: 1/115s 	 binary_crossentropy: 0.597208	 binary_accuracy: 0.792120Batch 117/1031,	elapsed: 2/12s 	 binary_crossentropy: 0.596090	 binary_accuracy: 0.788962Batch 213/1031,	elapsed: 3/11s 	 binary_crossentropy: 0.596657	 binary_accuracy: 0.788036Batch 310/1031,	elapsed: 4/11s 	 binary_crossentropy: 0.596614	 binary_accuracy: 0.789088Batch 404/1031,	elapsed: 5/11s 	 binary_crossentropy: 0.597216	 binary_accuracy: 0.788714Batch 502/1031,	elapsed: 6/11s 	 binary_crossentropy: 0.596570	 binary_accuracy: 0.789183

Get some example results from the the test data.

In [12]:
@tf.function(experimental_relax_shapes=True)
def predict_step(_model, _x_batch):
  predictions = _model(_x_batch, training=False)
  return predictions

def predict(_model, generator):
  num_batches = len(generator)
  out = [None]*num_batches
  for b in range(num_batches):
    x_batch, y_batch = generator[b]
    out[b] = predict_step(_model, x_batch)
  return np.concatenate(out)

most_evil_tweet=None
most_evil_evilness=1
most_cool_tweet=None
most_cool_coolness=1
most_angelic_tweet=None
most_angelic_angelicness=0
y_pred = predict(model, testing_generator)
for i in range(0,len(y_pred)):
    judgement = y_pred[i]
    polarity = abs(judgement-0.5)*2

    if judgement>=most_angelic_angelicness:
        most_angelic_angelicness = judgement
        most_angelic_tweet = x_test[i]
    if judgement<=most_evil_evilness:
        most_evil_evilness = judgement
        most_evil_tweet = x_test[i]
    if polarity<=most_cool_coolness:
        most_cool_coolness = polarity
        most_cool_tweet = x_test[i]


print("The evilest tweet known to humankind:\n\t", most_evil_tweet)
print("Evilness: ", 1.0-most_evil_evilness)
print("\n")
print("The most angelic tweet any mortal has ever laid eyes upon:\n\t", most_angelic_tweet)
print("Angelicness: ", most_angelic_angelicness)
print("\n")
print("And this tweet is simply too cool for you:\n\t", most_cool_tweet)
print("Coolness: ", 1.0-most_cool_coolness)

The evilest tweet known to humankind:
	 ['SAD to miss Ink n Iron    I miss LA so bad !!']
Evilness:  [1.]


The most angelic tweet any mortal has ever laid eyes upon:
	 ['@featureBlend Thanks  Have a great weekend bro!']
Angelicness:  [1.]


And this tweet is simply too cool for you:
	 ["@maltesk As national representative for the Netherlands I'm quite ashamed  "]
Coolness:  [0.9999981]
