<h1> Text Classification using TensorFlow/Keras on Cloud ML Engine </h1>

This notebook illustrates:
<ol>
<li> Creating datasets for Machine Learning using BigQuery
<li> Creating a text classification model using the Estimator API with a Keras model
<li> Training on Cloud ML Engine
<li> Deploying the model
<li> Predicting with the model
<li> Rerunning with a pre-trained embedding
</ol>

In [1]:
# change these to try this notebook out
# NOTE(review): BUCKET, PROJECT and REGION are not referenced by the cells shown
# in this part of the notebook; presumably they are the GCS bucket, GCP project
# ID and compute region used by the Cloud ML Engine steps — confirm.
BUCKET = 'vijays-sandbox-ml'
PROJECT = 'vijays-sandbox'
REGION = 'us-central1'
SEED = 0  # passed to tf.random.set_seed() in the training cell

In [2]:
import tensorflow as tf
# Sanity-check the environment: show the installed TensorFlow version
# (the notebook targets a TF 2.0 pre-release; the recorded run below shows 2.0.0-beta1)
print(tf.__version__) # tf 2.0 nightly
# ...and whether TensorFlow reports an available GPU
print(tf.test.is_gpu_available())

2.0.0-beta1
True


# Pure Keras

In [11]:
# IPython `?` help lookup: displays the Conv1D init signature. Exploratory only —
# nothing later in the notebook depends on this cell.
tf.keras.layers.Conv1D?

[0;31mInit signature:[0m
[0mtf[0m[0;34m.[0m[0mkeras[0m[0;34m.[0m[0mlayers[0m[0;34m.[0m[0mConv1D[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfilters[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel_size[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrides[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding[0m[0;34m=[0m[0;34m'valid'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_format[0m[0;34m=[0m[0;34m'channels_last'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdilation_rate[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mactivation[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_bias[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel_initializer[0m[0;34m=[0m[0;34m'glorot_uniform'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias_initializer[0m[0;34m=[0m[0;34m'zeros'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel_regularizer[0m[0;

In [35]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import re
import pickle

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import Conv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D

from google.cloud import storage

CLASSES = {'github': 0, 'nytimes': 1, 'techcrunch': 2}  # label-to-int mapping
TOP_K = 20000  # Limit on the vocabulary size used for tokenization (not referenced by the code shown here)
MAX_SEQUENCE_LENGTH = 50  # Sentences will be truncated/padded to this length

"""
Helper function to download data from Google Cloud Storage
  # Arguments:
      source: string, the GCS URL to download from (e.g. 'gs://bucket/file.csv')
      destination: string, the filename to save as on local disk. MUST be filename
      ONLY, doesn't support folders. (e.g. 'file.csv', NOT 'folder/file.csv')
  # Returns: nothing, downloads file to local disk
"""
def download_from_gcs(source, destination):
    """Download a single object from Google Cloud Storage to local disk.

    # Arguments:
        source: string, the GCS URL to download from (e.g. 'gs://bucket/file.csv')
        destination: string, the local filename to write to
    # Returns: nothing, downloads file to local disk
    # Raises:
        ValueError: if `source` is not of the form 'gs://<bucket>/<object>'
    """
    search = re.search('gs://(.*?)/(.*)', source)
    # Fail early with a clear message instead of an AttributeError on
    # `None.group(...)` or an opaque client error for an empty bucket/object name.
    if search is None or not search.group(1) or not search.group(2):
        raise ValueError(
            "Expected a GCS URL of the form 'gs://<bucket>/<object>', got %r" % (source,))

    bucket_name = search.group(1)
    blob_name = search.group(2)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    bucket.blob(blob_name).download_to_filename(destination)


"""
Parses raw tsv containing hacker news headlines and returns (sentence, integer label) pairs
  # Arguments:
      train_data_path: string, path to tsv containing training data.
        can be a local path or a GCS url (gs://...)
      eval_data_path: string, path to tsv containing eval data.
        can be a local path or a GCS url (gs://...)
  # Returns:
      ((train_sentences, train_labels), (test_sentences, test_labels)):  sentences
        are lists of strings, labels are numpy integer arrays
"""
def load_hacker_news_data(train_data_path, eval_data_path):
    """Load the train and eval TSV files as (sentences, integer labels) pairs.

    # Arguments:
        train_data_path: string, local path or GCS url (gs://...) of the training tsv
        eval_data_path: string, local path or GCS url (gs://...) of the eval tsv
    # Returns:
        ((train_sentences, train_labels), (test_sentences, test_labels)):
          sentences are lists of strings, labels are numpy int64 arrays
          obtained by mapping the label column through CLASSES
    # Raises:
        ValueError: if a file contains a label that is not a key of CLASSES
    """
    column_names = ('label', 'text')

    def _read_split(path, local_copy):
        # GCS inputs are first copied into the working directory.
        if path.startswith('gs://'):
            download_from_gcs(path, destination=local_copy)
            path = local_copy

        # Files are tab-separated with no header row: label <TAB> text
        df = pd.read_csv(path, names=column_names, sep='\t')

        # Series.map yields NaN for keys missing from CLASSES, which would
        # silently turn the label array into floats; reject that explicitly.
        label_ids = df['label'].map(CLASSES)
        unmapped = label_ids.isna()
        if unmapped.any():
            unknown = sorted(set(df.loc[unmapped, 'label'].astype(str)))
            raise ValueError(
                'Unknown label(s) %s in %s; expected one of %s'
                % (unknown, path, sorted(CLASSES)))

        return list(df['text']), np.asarray(label_ids, dtype=np.int64)

    return (_read_split(train_data_path, 'train.csv'),
            _read_split(eval_data_path, 'eval.csv'))


"""
Create tf.estimator compatible input function
  # Arguments:
      texts: [strings], list of sentences
      labels: numpy int vector, integer labels for sentences
      (no tokenizer argument is taken: sentences are split on whitespace and
        embedded with a TF Hub module inside this function)
      batch_size: int, number of records to use for each train batch
      mode: tf.estimator.ModeKeys.TRAIN or tf.estimator.ModeKeys.EVAL 
  # Returns:
      tf.data.dataset, produces feature and label
        tensors one batch at a time
"""
def input_fn(texts, labels, batch_size, mode, shuffle_buffer_size=50000):
    """Build a batched tf.data.Dataset of embedded sentences and one-hot labels.

    Each sentence is split on whitespace, right-padded with '<PAD>' and
    truncated to MAX_SEQUENCE_LENGTH tokens, then every token is embedded with
    the TF Hub NNLM module. All of this runs eagerly, before the Dataset is
    created.

    # Arguments:
        texts: [strings], list of sentences
        labels: numpy int vector, integer labels for sentences
        batch_size: int, number of records per batch
        mode: tf.estimator.ModeKeys.TRAIN or tf.estimator.ModeKeys.EVAL;
          EVAL datasets are not shuffled, any other mode is shuffled
        shuffle_buffer_size: int, buffer size passed to Dataset.shuffle
          (defaults to 50000)
    # Returns:
        tf.data.Dataset yielding (embedded sentences, one-hot labels) one
          batch at a time
    """
    # Precision and recall metrics require one hot labels
    labels = tf.one_hot(labels, len(CLASSES))

    # Whitespace-tokenize, then right-pad / truncate to a fixed token count
    padding = MAX_SEQUENCE_LENGTH * ['<PAD>']
    texts = [(sentence.split() + padding)[:MAX_SEQUENCE_LENGTH] for sentence in texts]

    # Embed each padded token list with the pre-trained TF Hub module
    # (presumably 128-dim per token, per the module name and the model's
    # InputLayer shape — confirm).
    embed = hub.load("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1")
    texts = [embed(sentence) for sentence in texts]

    # TODO (from the original author's notes): move tokenize / pad / embed into
    # the tf.data pipeline (dataset.map / flat_map), possibly with ragged tensors.
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

    if mode == tf.estimator.ModeKeys.EVAL:
        return dataset.batch(batch_size)
    return dataset.shuffle(shuffle_buffer_size).batch(batch_size)

"""
Builds and compiles a CNN model using keras (the Keras model itself is returned; it is not converted to a tf.estimator.Estimator)
  # Arguments
      model_dir: string, file path where training files will be written
      config: tf.estimator.RunConfig, specifies properties of tf Estimator
      filters: int, output dimension of the layers.
      kernel_size: int, length of the convolution window.
      embedding_dim: int, dimension of the embedding vectors.
      dropout_rate: float, percentage of input to drop at Dropout layers.
      pool_size: int, factor by which to downscale input at MaxPooling layer.
      embedding_path: string , file location of pre-trained embedding (if used)
        defaults to None which will cause the model to train embedding from scratch
      word_index: dictionary, mapping of vocabulary to integers. used only if
        pre-trained embedding is provided

    # Returns
        A compiled Keras Sequential model
"""
def keras_estimator(model_dir,
                    config,
                    learning_rate,
                    filters=64,
                    dropout_rate=0.2,
                    embedding_dim=200,
                    kernel_size=3,
                    pool_size=3,
                    embedding_path=None,
                    word_index=None):
    """Build and compile a 1-D CNN text classifier over pre-embedded sequences.

    Despite the name, this returns the compiled Keras model itself; it is not
    converted to a tf.estimator.Estimator.

    # Arguments
        model_dir: unused by this implementation (kept for call compatibility)
        config: unused by this implementation (kept for call compatibility)
        learning_rate: float, learning rate for the Adam optimizer
        filters: int, output dimension of the first Conv1D layer; the second
          Conv1D layer uses twice as many
        dropout_rate: float, fraction of input to drop at Dropout layers
        embedding_dim: unused; the input layer is fixed at
          (MAX_SEQUENCE_LENGTH, 128)
        kernel_size: int, length of the convolution window
        pool_size: int, factor by which to downscale input at MaxPooling layer
        embedding_path: unused by this implementation
        word_index: unused by this implementation

    # Returns
        A compiled Keras Sequential model with a softmax output over CLASSES
    """
    model = models.Sequential()

    # The model consumes sentences that are already embedded (see input_fn),
    # so it starts from a float input instead of an Embedding layer.
    model.add(tf.keras.layers.InputLayer(input_shape=(MAX_SEQUENCE_LENGTH, 128)))
    model.add(Dropout(rate=dropout_rate))
    model.add(Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation='relu',
        bias_initializer='random_uniform',
        padding='same'))

    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(
        filters=filters * 2,
        kernel_size=kernel_size,
        activation='relu',
        bias_initializer='random_uniform',
        padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(len(CLASSES), activation='softmax'))

    # Compile model with learning parameters.
    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        # labels are one-hot encoded (see input_fn), hence categorical crossentropy
        loss='categorical_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()
        ]
    )

    return model


"""
Defines the features to be passed to the model during inference 
  Accepts a 1-D batch of strings, passed to the model as-is
  # Arguments: none
  # Returns: tf.estimator.export.TensorServingInputReceiver
"""
def serving_input_fn():
    """Build the input receiver used when the model is exported for serving.

    A 1-D string placeholder of unknown length is created and handed through
    unchanged, serving as both the model features and the receiver tensor.

    # Arguments: none
    # Returns: tf.estimator.export.TensorServingInputReceiver
    """
    raw_strings = tf.compat.v1.placeholder(tf.string, [None])
    # No preprocessing at serving time: features are the raw placeholder.
    return tf.estimator.export.TensorServingInputReceiver(raw_strings, raw_strings)

In [34]:
%%time
dataset = input_fn(
    test_texts,
    test_labels,
    3,
    mode=tf.estimator.ModeKeys.EVAL
)
for batch in dataset:
    print(batch)
    break

(<tf.Tensor: id=872737, shape=(3, 50), dtype=string, numpy=
array([[b'show', b'hn', b'scrwl', b'shorthand', b'code', b'reading',
        b'and', b'writing', b'language', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>'],
       [b'geoip', b'module', b'on', b'nodejs', b'now', b'is', b'a', b'c',
        b'addon', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>', b'<PAD>',
        b'<PAD>',

In [37]:
%%time
hparams = {'train_data_path':'./data/txtcls/train.tsv',
           'eval_data_path':'./data/txtcls/eval.tsv',
           'batch_size':128}
# Load Data
((train_texts, train_labels), (test_texts, test_labels)) = load_hacker_news_data(
    hparams['train_data_path'], hparams['eval_data_path'])


train_dataset = input_fn(
    train_texts,
    train_labels,
    hparams['batch_size'],
    mode=tf.estimator.ModeKeys.TRAIN
)
eval_dataset = input_fn(
    test_texts,
    test_labels,
    hparams['batch_size'],
    mode=tf.estimator.ModeKeys.EVAL
)

CPU times: user 5min 9s, sys: 48.9 s, total: 5min 58s
Wall time: 4min 16s


In [40]:
# NOTE: this rebinds `hparams`, replacing the data-path / batch-size settings
# defined in the data-loading cell.
hparams = {
    'learning_rate': .001,
    'num_epochs': 3,
}

# embedding_path and word_index are left at their None defaults.
model = keras_estimator(model_dir='output_dir',
                        config=None,
                        learning_rate=hparams['learning_rate'])

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_11 (Dropout)         (None, 50, 128)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 50, 64)            24640     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 16, 64)            0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 16, 128)           24704     
_________________________________________________________________
global_average_pooling1d_6 ( (None, 128)               0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 3)                

In [41]:
%%time
# Seed TensorFlow's global RNG before training.
# NOTE(review): the model and the datasets are created in earlier cells, so this
# seed presumably does not cover weight initialisation — confirm.
tf.random.set_seed(SEED)
# NOTE(review): epochs is hard-coded to 5 here although hparams['num_epochs'] is
# set to 3 in the model-building cell — confirm which value is intended.
model.fit(
    train_dataset,
    epochs=5,
    validation_data=eval_dataset,
    validation_steps=None
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 58.2 s, sys: 11.1 s, total: 1min 9s
Wall time: 55.4 s


<tensorflow.python.keras.callbacks.History at 0x7f70610f9710>