In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow import keras

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

  return f(*args, **kwds)


You have TensorFlow version 1.4.0


In [3]:
# The CSV was generated from this query: https://bigquery.cloud.google.com/savedquery/513927984416:c494494324be4a80b1fc55f613abb39c
# The data is also publicly available at this Cloud Storage URL: https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv
data = pd.read_csv("stack-overflow-data.csv")

In [4]:
data.head()

Unnamed: 0,post,tags
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [5]:
# Confirm that we have a balanced dataset
# Note: data was randomly shuffled in our BigQuery query
data['tags'].value_counts()

jquery           2000
c++              2000
javascript       2000
ios              2000
java             2000
sql              2000
html             2000
angularjs        2000
mysql            2000
android          2000
objective-c      2000
c                2000
asp.net          2000
python           2000
c#               2000
ruby-on-rails    2000
iphone           2000
php              2000
.net             2000
css              2000
Name: tags, dtype: int64

In [6]:
# Split data into train and test
train_size = int(len(data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 32000
Test size: 8000


In [7]:
train_posts = data['post'][:train_size]
train_tags = data['tags'][:train_size]

test_posts = data['post'][train_size:]
test_tags = data['tags'][train_size:]

In [8]:
max_words = 1000
tokenize = keras.preprocessing.text.Tokenizer(num_words=max_words, char_level=False)

In [9]:
tokenize.fit_on_texts(train_posts) # only fit on train
x_train = tokenize.texts_to_matrix(train_posts).astype(np.float32) # model expects float32
x_test = tokenize.texts_to_matrix(test_posts).astype(np.float32)

In [10]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [11]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = keras.utils.to_categorical(y_train, num_classes).astype(np.float32)
y_test = keras.utils.to_categorical(y_test, num_classes).astype(np.float32)

In [12]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (32000, 1000)
x_test shape: (8000, 1000)
y_train shape: (32000, 20)
y_test shape: (8000, 20)


In [13]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 2

In [14]:
# Build the model
model = keras.models.Sequential()
model.add(keras.layers.Dense(512, input_shape=(max_words,), name="posts"))
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(num_classes))
model.add(keras.layers.Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [15]:
# Convert Keras model to estimator
estimator_model = keras.estimator.model_to_estimator(keras_model=model)

INFO:tensorflow:Using the Keras model from memory.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x10254de10>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [16]:
# Define input fn
def input_function(features, labels=None, shuffle=False):
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"posts_input": features}, # See the accompanying blog post for explanation on naming
        y=labels,
        shuffle=shuffle
    )
    return input_fn

In [17]:
estimator_model.train(input_fn=input_function(x_train, y_train, shuffle=True))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt.
INFO:tensorflow:loss = 3.0097, step = 1
INFO:tensorflow:global_step/sec: 96.0292
INFO:tensorflow:loss = 1.06048, step = 101 (1.044 sec)
INFO:tensorflow:global_step/sec: 97.6674
INFO:tensorflow:loss = 0.688308, step = 201 (1.022 sec)
INFO:tensorflow:Saving checkpoints for 251 into /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt.
INFO:tensorflow:Loss for final step: 0.855072.


<tensorflow.python.estimator.estimator.Estimator at 0x119350748>

In [18]:
# Evaluate the accuracy of our trained model
score = estimator_model.evaluate(input_function(x_test, labels=y_test))
print(score)

INFO:tensorflow:Starting evaluation at 2017-12-18-23:30:47
INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt-251
INFO:tensorflow:Finished evaluation at 2017-12-18-23:30:48
INFO:tensorflow:Saving dict for global step 251: accuracy = 0.813244, global_step = 251, loss = 0.644977
{'accuracy': 0.81324404, 'loss': 0.64497727, 'global_step': 251}


In [20]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

# We'll make predictions for the first five examples
examples = x_test[:5]
predictions = list(estimator_model.predict(input_function(examples)))

# Print out the true and expected label
for i in range(len(examples)):
    prediction_array = list(predictions[i].values())[0]
    predicted_label = text_labels[np.argmax(prediction_array)]
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label + "\n")  

INFO:tensorflow:Restoring parameters from /var/folders/1h/g9jk9_kx67d6g0_gyfnvk1n4008m_k/T/tmp0x6nb2bs/model.ckpt-251
Actual label:jquery
Predicted label: jquery

Actual label:.net
Predicted label: .net

Actual label:android
Predicted label: android

Actual label:c++
Predicted label: c++

Actual label:.net
Predicted label: c#

