In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras

# This code was tested with TensorFlow v1.4.
# Print the installed version so a mismatch is visible before running the rest of the notebook.
print("You have TensorFlow version", tf.__version__)

  return f(*args, **kwds)


You have TensorFlow version 1.4.0


In [3]:
# The CSV was generated from this query: https://bigquery.cloud.google.com/savedquery/513927984416:c494494324be4a80b1fc55f613abb39c
# The data is also publicly available at this Cloud Storage URL: https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv
# NOTE: the path below is relative to the notebook's working directory, so the
# CSV has to be saved at that location before this cell is run.
data = pd.read_csv("estimators-bow/so-export-0920.csv")

In [4]:
# Peek at the first rows to confirm the 'post' and 'tags' columns loaded as expected
data.head()

Unnamed: 0,post,tags
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [5]:
# Confirm that we have a balanced dataset: every tag should appear the same number of times
# Note: data was randomly shuffled in our BigQuery query
data['tags'].value_counts()

c#               2000
asp.net          2000
ruby-on-rails    2000
c                2000
java             2000
android          2000
ios              2000
javascript       2000
php              2000
.net             2000
sql              2000
jquery           2000
angularjs        2000
python           2000
objective-c      2000
html             2000
css              2000
c++              2000
mysql            2000
iphone           2000
Name: tags, dtype: int64

In [6]:
# Split data into train and test: the first 80% of the rows are used for
# training and the remaining rows are held out for testing
train_size = int(0.8 * len(data))
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size: 32000
Test size: 8000


In [7]:
# Positional split: the first `train_size` rows are the training set
train_posts = data['post'][:train_size]
train_tags = data['tags'][:train_size]

# Everything after the first `train_size` rows is the test set.
# NOTE: these slices keep the DataFrame's original row labels, which is why the
# prediction cell further down reads `test_posts` by position with `.iloc`.
test_posts = data['post'][train_size:]
test_tags = data['tags'][train_size:]

In [8]:
# Vocabulary cap for the bag-of-words features; it is also used as the width of
# the model's input layer and of the shapes declared in the input functions
max_words = 1000
# Word-level tokenizer (char_level=False) capped via num_words=max_words
tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, char_level=False)

In [9]:
tokenize.fit_on_texts(train_posts) # only fit on train
# Turn every post into a fixed-width vector (one column per vocabulary slot).
# The cast to float32 matches the tf.float32 dtype declared in the input
# functions below; without it the estimator raises a dtype conversion error.
x_train = tokenize.texts_to_matrix(train_posts).astype(np.float32)
x_test = tokenize.texts_to_matrix(test_posts).astype(np.float32)

In [10]:
# Use sklearn utility to convert label strings to numbered index.
# The encoder learns its tag -> index mapping from the training tags only and
# is then applied unchanged to both splits.
encoder = LabelEncoder().fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [11]:
# Converts the labels to a one-hot representation
# num_classes has to be computed first, while y_train still holds integer class indices.
# NOTE: this cell overwrites y_train / y_test, so re-run the label-encoding cell
# above before re-running this one (otherwise already one-hot data gets encoded again).
num_classes = np.max(y_train) + 1
y_train = keras.utils.to_categorical(y_train, num_classes).astype(np.float32)
y_test = keras.utils.to_categorical(y_test, num_classes).astype(np.float32)

In [12]:
# Inspect the dimensions of our training and test data (this is helpful to debug).
# Features should be (num_examples, max_words) and labels (num_examples, num_classes).
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (32000, 1000)
x_test shape: (8000, 1000)
y_train shape: (32000, 20)
y_test shape: (8000, 20)


In [13]:
# Build the model: one hidden layer on top of the bag-of-words vector, followed
# by a softmax over the tag classes.
# The explicit layer names are relied on later in the notebook: the input
# functions feed features under the key 'posts_input', and the prediction cell
# reads the model output under the key 'labels'.
model = keras.models.Sequential([
    keras.layers.Dense(512, input_shape=(max_words,), name="posts"),
    keras.layers.Activation('relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(num_classes),
    keras.layers.Activation('softmax', name="labels"),
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [14]:
# Convert Keras model to estimator
# No model directory is passed, so checkpoints go to a temporary directory
# (see '_model_dir' in the config logged by this cell).
estimator_model = keras.estimator.model_to_estimator(keras_model=model)

INFO:tensorflow:Using the Keras model from memory.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpr_wtvazo', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x108871780>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [15]:
def my_generator(data, labels):
    """Build a zero-argument generator function over (features, label) pairs.

    Args:
        data: indexable sequence of feature rows; `len(data)` sets how many
            examples are produced.
        labels: indexable sequence of labels, read at the same positions as
            `data`.

    Returns:
        A callable that takes no arguments. Each call starts a fresh generator
        yielding `({'posts_input': data[i]}, labels[i])` for every position
        `i` in `data`, in order.
    """

    def gen():
        for row in range(len(data)):
            features = {'posts_input': data[row]}
            yield (features, labels[row])

    return gen
        
def train_input_function():
    """Input function passed to `estimator_model.train`.

    Streams examples from `x_train` / `y_train` through `my_generator`, groups
    them into batches of 32 and makes two passes over the batched data.

    Returns:
        A `(features, labels)` tuple taken from a one-shot iterator over the
        dataset; `features` is a dict keyed by 'posts_input'.
    """
    # `output_types` / `output_shapes` describe ONE element produced by the
    # generator, using the same nesting as the element itself: a tuple of
    # (feature dict, label vector).
    element_types = ({'posts_input': tf.float32}, tf.float32)
    element_shapes = ({'posts_input': (max_words,)}, (num_classes,))

    examples = tf.data.Dataset.from_generator(
        my_generator(data=x_train, labels=y_train),
        output_types=element_types,
        output_shapes=element_shapes)

    # Number of batches produced = (num_examples / batch_size) * num_epochs (repeat)
    batched = examples.batch(32).repeat(2)

    # datasets api returns tuple of features and labels
    batch_features, batch_labels = batched.make_one_shot_iterator().get_next()
    return batch_features, batch_labels


def test_input_function():
    """Input function passed to `estimator_model.evaluate`.

    Streams examples from `x_test` / `y_test` through `my_generator` in
    batches of 32, making a single pass over the data.

    Returns:
        A `(features, labels)` tuple taken from a one-shot iterator over the
        dataset; `features` is a dict keyed by 'posts_input'.
    """
    # Structure of one generator element: (feature dict, label vector)
    element_types = ({'posts_input': tf.float32}, tf.float32)
    element_shapes = ({'posts_input': (max_words,)}, (num_classes,))

    examples = tf.data.Dataset.from_generator(
        my_generator(data=x_test, labels=y_test),
        output_types=element_types,
        output_shapes=element_shapes)

    # repeat(1): we want to test on every example in the test set, once
    batched = examples.batch(32).repeat(1)

    batch_features, batch_labels = batched.make_one_shot_iterator().get_next()
    return batch_features, batch_labels

In [16]:
# Got this error w/o converting from 64 to 32: Tensor conversion requested dtype float64 for Tensor with dtype float32
# No `steps` argument is given, so training runs until train_input_function is
# exhausted: (num_examples / batch_size) * repeat count batches.
estimator_model.train(input_fn=train_input_function)  # pass steps=N here to cap the number of training steps instead

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpr_wtvazo/
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpr_wtvazo/model.ckpt.
INFO:tensorflow:loss = 3.11502, step = 1
INFO:tensorflow:global_step/sec: 66.7331
INFO:tensorflow:loss = 1.51764, step = 101 (1.501 sec)
INFO:tensorflow:global_step/sec: 89.4373
INFO:tensorflow:loss = 1.00383, step = 201 (1.117 sec)
INFO:tensorflow:global_step/sec: 98.7426
INFO:tensorflow:loss = 0.848253, step = 301 (1.013 sec)
INFO:tensorflow:global_step/sec: 96.9965
INFO:tensorflow:loss = 0.49097, step = 401 (1.031 sec)
INFO:tensorflow:global_step/sec: 94.1459
INFO:tensorflow:loss = 0.709625, step = 501 (1.062 sec)
INFO:tensorflow:global_step/sec: 95.6352
INFO:tensorflow:loss = 0.972857, step = 601 (1.045 sec)
INFO:tensorflow:global_step/sec: 96.8555
INFO:tensorflow:loss = 1.04335, step = 701 (1.033 sec)
INFO:tensorflow:g

<tensorflow.python.estimator.estimator.Estimator at 0x11938c4e0>

In [17]:
# Evaluate the accuracy of our trained model
# evaluate() returns a dict of metrics (here: accuracy, loss and global_step)
score = estimator_model.evaluate(input_fn=test_input_function)
print(score)

INFO:tensorflow:Starting evaluation at 2017-11-14-20:06:16
INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpr_wtvazo/model.ckpt-2001
INFO:tensorflow:Finished evaluation at 2017-11-14-20:06:17
INFO:tensorflow:Saving dict for global step 2001: accuracy = 0.8095, global_step = 2001, loss = 0.58087
{'accuracy': 0.80949998, 'loss': 0.58087039, 'global_step': 2001}


In [24]:
# Here's how to generate a prediction on individual examples
# `encoder.classes_` is indexed by class index, so it maps a predicted index back to its tag string
text_labels = encoder.classes_ 

# No labels in the predict generator
def my_predict_generator(data):
    """Build a zero-argument generator function over feature rows only.

    Args:
        data: indexable sequence of feature rows; `len(data)` sets how many
            examples are produced.

    Returns:
        A callable that takes no arguments. Each call starts a fresh generator
        yielding the dict `{'posts_input': data[i]}` for every position `i`
        in `data`, in order. No label is attached.
    """
    def gen():
        for row in range(len(data)):
            features = {'posts_input': data[row]}
            yield features
    return gen

def predict_input_function():
    """Input function passed to `estimator_model.predict`.

    Reads the module-level `x_predict` array at call time, so `x_predict` must
    be defined before the estimator invokes this function. Examples are
    streamed through `my_predict_generator` in batches of 32, in a single pass.

    Returns:
        The features dict (keyed by 'posts_input') taken from a one-shot
        iterator over the dataset. There are no labels at prediction time.
    """
    # Structure of one generator element: just the feature dict
    feature_types = {'posts_input': tf.float32}
    feature_shapes = {'posts_input': (max_words,)}

    examples = tf.data.Dataset.from_generator(
        my_predict_generator(data=x_predict),
        output_types=feature_types,
        output_shapes=feature_shapes)

    # repeat(1): every example in x_predict is predicted exactly once
    batched = examples.batch(32).repeat(1)

    features = batched.make_one_shot_iterator().get_next()
    return features
  
# Create some dummy "predict" data
# Here, we'll just use a few instances from test
num_predict_examples = 10
x_predict = x_test[:num_predict_examples]
actual_labels = y_test[:num_predict_examples]

# predict() is consumed with list() so every prediction can be indexed below
predictions = list(estimator_model.predict(input_fn=predict_input_function))
# TODO: is this still a problem?
# https://github.com/tensorflow/tensorflow/issues/11621
# WARNING:tensorflow:Input graph does not contain a QueueRunner. That means predict yields forever. This is probably a mistake.


for i in range(num_predict_examples):
    # Each prediction is a dict; 'labels' is the name given to the model's final softmax layer
    prediction_array = list(predictions[i]['labels'])
    # The position of the largest score is the predicted class index
    predicted_label_idx = np.argmax(prediction_array)
    predicted_label_text = text_labels[predicted_label_idx]
    
    # x_predict holds the first rows of x_test, so position i of test_posts is the matching raw text
    print("Original SO post", test_posts.iloc[i][:50], "...")
    # actual_labels are one-hot, so argmax recovers the true class index
    print('Actual label:', text_labels[np.argmax(actual_labels[i])])
    print("Predicted label: ", predicted_label_text + "\n") 

INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmpr_wtvazo/model.ckpt-2001
Original SO post jquery( tr_selector ) is removing the style values ...
Actual label: jquery
Predicted label:  jquery

Original SO post web reference in wcf  in wcf if we use webreferenc ...
Actual label: .net
Predicted label:  asp.net

Original SO post fragment transaction custom animation - android  i ...
Actual label: android
Predicted label:  android

Original SO post why can i not use the namespace directive in c++ s ...
Actual label: c++
Predicted label:  c++

Original SO post generating password reset link in c# .net  i must  ...
Actual label: .net
Predicted label:  c#

Original SO post php 5.3 $this versus php 5.4  i am calling a membe ...
Actual label: php
Predicted label:  php

Original SO post undefined method `request_uri  actiondispatch  i w ...
Actual label: ruby-on-rails
Predicted label:  ruby-on-rails

Original SO post my table view is not showing data 