# Build a simple text classifier with tf.estimator and tensorflow_hub

In [1]:
# Install tensorflow_hub library
!pip install tensorflow_hub --user



In [2]:
## Suppress useless warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

### Load Data: This dataset contains movie reviews along with their associated binary sentiment polarity labels.

In [4]:
def _read_reviews(csv_path):
    """Read a review CSV into a DataFrame, discarding unnamed columns.

    The training file carries a header-less first column that pandas labels
    "Unnamed: 0" (most likely a saved row index). It is not a feature, so any
    column whose label starts with "Unnamed" is dropped. Files without such a
    column are returned unchanged.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new DataFrame holding only the named columns of the file.
    """
    frame = pd.read_csv(csv_path)
    is_unnamed = frame.columns.str.startswith('Unnamed')
    return frame.loc[:, ~is_unnamed]


train_df = _read_reviews('data/text/train_binary_sent.csv')
test_df = _read_reviews('data/text/test_binary_sent.csv')

print('Number of Samples in training: ', len(train_df))
print('Number of Samples in testing: ', len(test_df))
print('Training data looks like:\n')

# Sample: the first five training rows are the cell's displayed output
train_df.head(5)

Number of Samples in training:  6920
Number of Samples in testing:  1821
Training data looks like:



Unnamed: 0,label,sentence
0,1,"A stirring, funny and finally transporting re-..."
1,0,Apparently reassembled from the cutting-room f...
2,0,They presume their audience won't sit still fo...
3,1,This is a visually stunning rumination on love...
4,1,Jonathan Parker's Bartleby should have been th...


#### Look into a few samples [0 = Negative, 1 = Positive]

In [5]:
# Show one training example (row position 4): its text first, then its label.
sample_row = train_df.iloc[4]
print("Review:\n\n", sample_row.sentence)
print("\n\nPolarity:", sample_row.label)

Review:

 Jonathan Parker's Bartleby should have been the be-all-end-all of the modern-office anomie films.


Polarity: 1


# Let's model using TensorFlow's Estimator framework:

### 1. The Estimator framework provides input functions that wrap pandas DataFrames.



In [6]:
# The model reads a single feature, "sentence" (the key of the text embedding
# column), so only that column is passed as `x`. Passing the whole DataFrame
# would also expose the "label" target (and any other column in the frame) as
# input features and push those unused tensors through the input queue.
train_features = train_df[["sentence"]]
test_features = test_df[["sentence"]]

# Training input: shuffled batches over the whole training set, repeated
# indefinitely (num_epochs=None); the number of training steps bounds the run.
train_input_fn = tf.estimator.inputs.pandas_input_fn(x=train_features,
                                                     y=train_df["label"],
                                                     num_epochs=None,
                                                     shuffle=True)

# Evaluation input over the whole training set, in its original order.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(x=train_features,
                                                             y=train_df["label"],
                                                             shuffle=False)

# Evaluation input over the test set, in its original order.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(x=test_features,
                                                            y=test_df["label"],
                                                            shuffle=False)

### 2. Using Feature Columns:

TF-Hub provides a feature column that applies a module to the given text feature and passes the module's outputs on to the rest of the model. In this tutorial we will use the nnlm-en-dim128 module. For the purposes of this tutorial, the most important facts are:

1. The module takes a batch of sentences in a 1-D tensor of strings as input.
2. The module is responsible for preprocessing of sentences (e.g. removal of punctuation and splitting on spaces).
3. The module works with any input (e.g. nnlm-en-dim128 hashes words not present in the vocabulary into ~20,000 buckets).

In [None]:
# Feature column that embeds the "sentence" text feature with a TF-Hub module.
# trainable=True asks for the module's weights to be updated during training.
NNLM_MODULE_URL = "https://tfhub.dev/google/nnlm-en-dim128/1"

embedded_text_feature_column = hub.text_embedding_column(
    key="sentence",
    module_spec=NNLM_MODULE_URL,
    trainable=True,
)

INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/1' already being downloaded by 'utsa-ai-75.4267.c0f4a933e2e04f05be0cc7007ce65b19'. Waiting.
INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/1' already being downloaded by 'utsa-ai-75.4267.c0f4a933e2e04f05be0cc7007ce65b19'. Waiting.
INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/1' already being downloaded by 'utsa-ai-75.4267.c0f4a933e2e04f05be0cc7007ce65b19'. Waiting.
INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/1' already being downloaded by 'utsa-ai-75.4267.c0f4a933e2e04f05be0cc7007ce65b19'. Waiting.
INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/1' already being downloaded by 'utsa-ai-75.4267.c0f4a933e2e04f05be0cc7007ce65b19'. Waiting.
INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/1' already being downloaded by 'utsa-ai-75.4267.c0f4a933e2e04f05be0cc7007ce65b19'. Waiting.
INFO:tensorflow:Module 'https://tfhub.dev/google/nnlm-en-dim128/

### 3. Estimator

For classification we can use a DNN classifier and specify the hidden layers and other properties of the neural network.

In [None]:
# Binary classifier: two fully connected hidden layers (512 then 128 units,
# ReLU activations) on top of the embedded text, optimised with Adagrad.
adagrad = tf.train.AdagradOptimizer(learning_rate=0.003)

estimator = tf.estimator.DNNClassifier(
    feature_columns=[embedded_text_feature_column],
    hidden_units=[512, 128],
    activation_fn=tf.nn.relu,
    n_classes=2,
    optimizer=adagrad,
)

### 4. Train

In [None]:
# Training for 1,000 steps means 128,000 training examples with the default
# batch size of 128. The training set loaded in this notebook contains 6,920
# examples, so this is roughly 18.5 epochs (128,000 / 6,920).
TRAIN_STEPS = 1000

estimator.train(input_fn=train_input_fn,  # training input function declared above
                steps=TRAIN_STEPS)

### 5. Evaluation

In [None]:
## Only log errors from here on, so the evaluation output stays short.
tf.logging.set_verbosity(tf.logging.ERROR)

## Evaluate on both splits first, then report the accuracy of each.
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

for template, metrics in (
        ("Training set accuracy: {accuracy}", train_eval_result),
        ("Test set accuracy: {accuracy}", test_eval_result)):
    print(template.format(**metrics))