# Build a simple text classifier with tf.estimator and tensorflow_hub

In [10]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

## Suppress useless warnings
import warnings
warnings.filterwarnings('ignore')

### Load Data: This dataset contains movie reviews along with their associated binary sentiment polarity labels.

In [11]:
train_df = pd.read_csv('data/sentiment_train.csv') 
test_df = pd.read_csv('data/sentiment_test.csv')

print('Number of Samples in training: ', len(train_df))
print('Number of Samples in testing: ', len(test_df))
print('Training data looks like:\n')

# Sample
train_df.head(5)

Number of Samples in training:  25000
Number of Samples in testing:  25000
Training data looks like:



Unnamed: 0,sentence,polarity
0,I found it very very difficulty to watch this ...,0
1,The mood of this movie is pretty good and it c...,0
2,American film makers decided to make a film th...,0
3,The music and Laurence Olivier's sombre delive...,1
4,I absolutely love this game to death. Ever sin...,1


#### Look into few samples [0 = Negative, 1 = Positive]

In [12]:
print("Review:\n\n",train_df.iloc[4].sentence)
print("\n\nPolarity:",train_df.iloc[4].polarity)

Review:

 I absolutely love this game to death. Ever since I was 9 years old (I am now 15). It has great graphics, characters, magic, weapons, additions, and don't forget the ultimately awesome dragoon forms! I am still waiting for a remake, prequel, or a sequel to this spectacular video game. <br /><br />You play as Dart, a young swordsman who has the potential to be quite the hero. On this adventure you encounter wondrous creatures and boss fights. You also encounter some friends on the way who have their own special element. Such as Fire, Darkness, Water/Ice, Thunder/Lightning, Earth, Light, and Wind. There are also items known as dragoon spirits, which allow you to transform into magical creatures of legend. Dragons, wizards, creatures called winglies and evil creatures you'll have to face on this adventure of action-packed thrills and excitement. One of my all time favorite games, The Legend of Dragoon!


Polarity: 1


# Lets Model Using tensorflow's Estimator framework:

### 1. Estimator framework provides input functions that wrap Pandas dataframes.



In [13]:
# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.estimator.inputs.pandas_input_fn(x=train_df, 
                                                     y=train_df["polarity"], 
                                                     num_epochs=None,
                                                     shuffle=True)

# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(x=train_df,
                                                             y=train_df["polarity"],
                                                             shuffle=False)

# Prediction on the test set.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(x=test_df, 
                                                            y=test_df["polarity"], 
                                                            shuffle=False)

### 2. Using Feature Columns:

TF-Hub provides a feature column that applies a module on the given text feature and passes further the outputs of the module. In this tutorial we will be using the nnlm-en-dim128 module. For the purpose of this tutorial, the most important facts are:

1. The module takes a batch of sentences in a 1-D tensor of strings as input.
2. The module is responsible for preprocessing of sentences (e.g. removal of punctuation and splitting on spaces).
3. The module works with any input (e.g. nnlm-en-dim128 hashes words not present in vocabulary into ~20.000 buckets).

In [14]:
embedded_text_feature_column = hub.text_embedding_column(key="sentence", # embed the text
                                                         module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",# embedding type
                                                         trainable=True)

### 3. Estimator

For classification we can use a DNN Classifier, and specify the hidden layers and other properties of the neural network

In [15]:
estimator = tf.estimator.DNNClassifier(hidden_units=[512, 128],
                                       feature_columns=[embedded_text_feature_column],
                                       n_classes=2,
                                       activation_fn = tf.nn.relu,
                                       optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

### 4. Train

In [17]:


# Training for 1,000 steps means 128,000 training examples with the default
# batch size. This is roughly equivalent to 5 epochs since the training dataset
# contains 25,000 examples.

estimator.train(input_fn=train_input_fn, # training using train input function declared above
                steps=1000)

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f31d47e3630>

### 5. Evaluation

In [18]:
## Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

## EVAL
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

print ("Training set accuracy: {accuracy}".format(**train_eval_result))
print ("Test set accuracy: {accuracy}".format(**test_eval_result))

Training set accuracy: 0.9914000034332275
Test set accuracy: 0.8652399778366089
