# Text classification with an RNN

This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) on the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.

In [1]:
from kubeflow import fairing
from kubeflow.fairing import TrainJob
import importlib
import argparse
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [2]:
# Selects what the __main__ cell does: True parses hyperparameters from the
# command line and trains in-process (the Katib path); False submits the
# training class as a Kubeflow Fairing TrainJob.
USING_KATIB = False

In [3]:
def data_loader(hyperparams, local_data_dir):
    """Load the IMDB 'subwords8k' reviews and batch the train/test splits.

    Args:
        hyperparams: Mapping with 'BUFFER_SIZE' (shuffle buffer size for the
            training split) and 'BATCH_SIZE' (batch size for both splits).
        local_data_dir: Directory handed to ``tfds.load`` as ``data_dir``.

    Returns:
        A ``(train_dataset, test_dataset, encoder)`` tuple, where ``encoder``
        is the text encoder taken from the dataset's 'text' feature info.
    """
    splits, metadata = tfds.load(
        'imdb_reviews/subwords8k',
        data_dir=local_data_dir,
        with_info=True,
        as_supervised=True,
    )
    train_split = splits['train']
    test_split = splits['test']
    text_encoder = metadata.features['text'].encoder

    # Only the training split is shuffled; both splits are padded per batch.
    shuffled_train = train_split.shuffle(hyperparams['BUFFER_SIZE'])
    batched_train = shuffled_train.padded_batch(
        hyperparams['BATCH_SIZE'], padded_shapes=None)
    batched_test = test_split.padded_batch(
        hyperparams['BATCH_SIZE'], padded_shapes=None)
    return batched_train, batched_test, text_encoder

In [4]:
def define_model(encoder):
    """Build the (uncompiled) bidirectional-LSTM classifier.

    Args:
        encoder: Text encoder whose ``vocab_size`` sets the embedding input
            dimension.

    Returns:
        A ``tf.keras.Sequential`` model: Embedding(64) -> Bidirectional
        LSTM(64) -> Dense(64, relu) -> Dense(1) with no output activation.
    """
    layers = tf.keras.layers
    embedding = layers.Embedding(encoder.vocab_size, 64)
    recurrent = layers.Bidirectional(layers.LSTM(64))
    hidden = layers.Dense(64, activation='relu')
    # Single unit without activation: the model emits one raw score per example.
    output = layers.Dense(1)
    return tf.keras.Sequential([embedding, recurrent, hidden, output])

In [5]:
class MovieReviewClassification(object):
    """Trainable wrapper around the IMDB review classifier.

    The constructor loads the datasets; ``train()`` builds, fits, saves and
    evaluates the model. All constructor arguments have defaults, so the class
    can be instantiated without arguments.
    """

    def __init__(self, learning_rate=1e-4, batch_size=64, epochs=2, local_data_dir='/app/tensorflow_datasets'):
        """Store the training settings and load the datasets.

        Args:
            learning_rate: Learning rate passed to the Adam optimizer.
            batch_size: Batch size used for both train and test datasets.
            epochs: Number of epochs passed to ``model.fit``.
            local_data_dir: Directory from which the dataset is loaded.
        """
        self.model_file = "lstm_trained"
        self.learning_rate = learning_rate
        self.epochs = epochs
        loaded = data_loader({'BUFFER_SIZE': 10000, 'BATCH_SIZE': batch_size},
                             local_data_dir)
        self.train_dataset, self.test_dataset, self.encoder = loaded

    def train(self):
        """Build, compile, fit, save and evaluate the model, printing test metrics."""
        classifier = define_model(self.encoder)

        # from_logits=True because the model's final Dense(1) has no activation.
        loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        optimizer = tf.keras.optimizers.Adam(self.learning_rate)
        classifier.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])

        classifier.fit(
            self.train_dataset,
            epochs=self.epochs,
            validation_data=self.test_dataset,
            validation_steps=30,
        )
        classifier.save(self.model_file)

        loss_value, accuracy_value = classifier.evaluate(self.test_dataset)
        print('Test Loss: {}'.format(loss_value))
        print('Test Accuracy: {}'.format(accuracy_value))

In [6]:
if __name__ == "__main__":
    # Entry point: either train in-process with command-line hyperparameters
    # (Katib path) or package the training class and submit it with Fairing.

    if USING_KATIB:
        # Hyperparameters arrive as command-line flags. Let argparse do the
        # conversion: batch size and epoch count must be integers (a float
        # batch size is rejected by Dataset.padded_batch and a float epoch
        # count by Model.fit), only the learning rate is a float.
        parser = argparse.ArgumentParser(description="Using Katib for hyperparameter tuning")
        parser.add_argument("-lr", "--learning_rate", type=float, default=1e-4,
                            help="Learning rate for the Keras optimizer")
        parser.add_argument("-bsz", "--batch_size", type=int, default=64,
                            help="Batch size for each step of learning")
        parser.add_argument("-e", "--epochs", type=int, default=2,
                            help="Number of epochs in each trial")
        args = parser.parse_args()
        learning_rate = args.learning_rate
        batch_size = args.batch_size
        epochs = args.epochs
        model = MovieReviewClassification(learning_rate, batch_size, epochs, local_data_dir="~/tensorflow_datasets")
        model.train()

    else:
        # Using Fairing: build an image and run MovieReviewClassification remotely.
        GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
        DOCKER_REGISTRY = 'gcr.io/{}/fairing-job'.format(GCP_PROJECT)
        BuildContext = None
        FAIRING_BACKEND = 'KubeflowGKEBackend'
        # Resolve the backend class by name from kubeflow.fairing.backends.
        BackendClass = getattr(importlib.import_module('kubeflow.fairing.backends'), FAIRING_BACKEND)

        # Files handed to TrainJob as input_files: the locally prepared
        # dataset artifacts plus requirements.txt.
        data_files = ['tensorflow_datasets/downloads/ai.stanfor.edu_amaas_sentime_aclImdb_v1xA90oY07YfkP66HhdzDg046Ll8Bf3nAIlC6Rkj0WWP4.tar.gz',
                      'tensorflow_datasets/downloads/ai.stanfor.edu_amaas_sentime_aclImdb_v1xA90oY07YfkP66HhdzDg046Ll8Bf3nAIlC6Rkj0WWP4.tar.gz.INFO',
                      'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/dataset_info.json',
                      'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/imdb_reviews-test.tfrecord-00000-of-00001',
                      'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/imdb_reviews-train.tfrecord-00000-of-00001',
                      'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/imdb_reviews-unsupervised.tfrecord-00000-of-00001',
                      'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/label.labels.txt',
                      'tensorflow_datasets/imdb_reviews/subwords8k/1.0.0/text.text.subwords',
                      'requirements.txt']

        train_job = TrainJob(MovieReviewClassification,
                              input_files=data_files,
                              docker_registry=DOCKER_REGISTRY,
                              backend=BackendClass(build_context_source=BuildContext))
        train_job.submit()


[W 200628 06:28:15 tasks:54] Using default base docker image: registry.hub.docker.com/library/python:3.6.9
[W 200628 06:28:15 tasks:62] Using builder: <class 'kubeflow.fairing.builders.cluster.cluster.ClusterBuilder'>
[I 200628 06:28:15 tasks:66] Building the docker image.
[I 200628 06:28:15 cluster:46] Building image using cluster builder.
[W 200628 06:28:15 base:94] /home/jovyan/.local/lib/python3.6/site-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 200628 06:28:15 base:107] Creating docker context: /tmp/fairing_context_spyuvd88
[W 200628 06:28:15 base:94] /home/jovyan/.local/lib/python3.6/site-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[W 200628 06:28:29 manager:298] Waiting for fairing-builder-lhs2f-qb8n9 to start...
[W 200628 06:28:30 manager:298] Waiting for fairing-builder-lhs2f-qb8n9 to start...
[W 200628 06:28:30 manager:298] Waiting for fairing-builder-lhs2f-qb8n9 to start...
[I 200628 06:28:3

E0628 06:28:36.784021       1 aws_credentials.go:77] while getting AWS credentials NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
E0628 06:28:36.794868       1 metadata.go:248] Failed to unmarshal scopes: json: cannot unmarshal string into Go value of type []string
[36mINFO[0m[0005] Retrieving image manifest registry.hub.docker.com/library/python:3.6.9
E0628 06:28:37.243851       1 metadata.go:154] while reading 'google-dockercfg' metadata: http status code: 404 while fetching url http://metadata.google.internal./computeMetadata/v1/instance/attributes/google-dockercfg
E0628 06:28:37.245502       1 metadata.go:166] while reading 'google-dockercfg-url' metadata: http status code: 404 while fetching url http://metadata.google.internal./computeMetadata/v1/instance/attributes/google-dockercfg-url
[36mINFO[0m[0008] Retrieving image manifest registry.hub.docker.com/library/python:3.6.9
[36mINFO[0m[0010]

[W 200628 06:30:18 job:101] The job fairing-job-54xjr launched.
[W 200628 06:30:19 manager:298] Waiting for fairing-job-54xjr-8ljgq to start...
[W 200628 06:30:19 manager:298] Waiting for fairing-job-54xjr-8ljgq to start...
[W 200628 06:30:19 manager:298] Waiting for fairing-job-54xjr-8ljgq to start...
[I 200628 06:31:06 manager:304] Pod started running True


TFDS datasets with text encoding are deprecated and will be removed in a future version. Instead, you should use the plain text version and tokenize the text using `tensorflow_text` (See: https://www.tensorflow.org/tutorials/tensorflow_text/intro#tfdata_example)
Load dataset info from /app/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0
Reusing dataset imdb_reviews (/app/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0)
Constructing tf.data.Dataset for split None, from /app/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0
2020-06-28 06:31:12.033142: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-06-28 06:31:12.033310: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2020-06-28 06:31:12.033347: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be 

[W 200628 06:49:23 job:173] Cleaning up job fairing-job-54xjr...
