In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install nltk tensorflow-hub

[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import preprocessor  # this will download some data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import preprocessor
import pandas as pd
import pickle
import os

In [4]:
def maybe_process(store_file="data.pkl", force=False):
    if force or not os.path.exists(store_file):
        data = preprocessor.process_dir("dump")
        with open(store_file, 'wb') as data_file:
            pickle.dump(data, data_file)
    else:
        with open(store_file, 'rb') as data_file:
            data = pickle.load(data_file)
    return data

In [5]:
data = maybe_process()

In [6]:
df = pd.DataFrame(data)
df = df.fillna("Unknown")

In [7]:
user_vocabulary = pd.concat([df["assignee"], df["reporter"]]).unique()

In [8]:
with open("vocabulary.pkl", 'wb') as vocabulary_file:
    pickle.dump(user_vocabulary, vocabulary_file)

In [9]:
vocabulary_map = {"Unknown": 0}
count = 0
for el in user_vocabulary:
    count += 1
    vocabulary_map[el] = count

In [10]:
with open("vocabulary.pkl", 'wb') as vocabulary_file:
    pickle.dump(user_vocabulary, vocabulary_file)

with open("vocabulary_map.pkl", 'wb') as vocabulary_file:
    pickle.dump(vocabulary_map, vocabulary_file)

Start tensorflow stuff here

In [11]:
import tensorflow as tf
import tensorflow_hub as hub
import shutil
import numpy as np

print(tf.__version__)
tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

1.9.0


In [12]:
df.head()

Unnamed: 0,assignee,description,id,key,reporter,status,summary
0,yc14sv1,\r\nNavapp Unknown field error:\r\n\r\nTue Sep...,376133,WDSDO-9863,m681174,Closed,NavApp Unknown field error Error - in C4D
1,yc03464,Add simple way to set required variables for k...,745534,DEVOPS-2005,yc03464,Closed,KitchenLib: kitchenHeader simple setup for the...
2,yc04492,"Hi DevOps,\r\nDeployments are failing in 16D c...",577037,WDSDO-19043,yc05vv1,Closed,Deployments are failing in 16D customer pipeline
3,yc14am4,http://platform-build:8080/jenkins/job/SDP_UN_...,354338,WDSDO-8489,yc14im2,Closed,FCC Keyspace Is Not Found
4,yc14db1,Customer Domain improved CI pipeline,348404,WDSDO-8085,m086782,Closed,Customer Domain improved CI pipeline


In [13]:
df.describe()

Unnamed: 0,assignee,description,id,key,reporter,status,summary
count,41397,41397,41397,41397,41397,41397,41397
unique,1121,40159,41397,41397,2151,15,39824
top,yc14tz1,TBD,474129,WDSDO-6276,yc14sv1,Closed,Discovery build failed
freq,3832,104,1,1,1002,36637,22


Now, split the data into two parts -- training and evaluation.

In [14]:
np.random.seed(seed=1) #makes result reproducible
msk = np.random.rand(len(df)) < 0.8
traindf = df[msk]
evaldf = df[~msk]

In [15]:
def add_more_features(df):
    # TODO: Add more features to the dataframe
    df["summary_clean"] = df["summary"].apply(lambda x: " ".join(preprocessor.process_text(x)))
    df["description_clean"] = df["description"].apply(lambda x: " ".join(preprocessor.process_text(x)))
    return df

In [16]:
# Create pandas input function
def make_input_fn(df, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(
        x = add_more_features(df),
#         y = df["assignee"].fillna('Unknown').apply(lambda x: vocabulary_map[x]),
        y = df["assignee"],
        batch_size = 128,
        num_epochs = num_epochs,
        shuffle = True,
        queue_capacity = 1000,
        num_threads = 1
      )

In [17]:
# Define your feature columns
def create_feature_cols():
    return [
#         tf.feature_column.categorical_column_with_vocabulary_list("reporter", 
#                                                                   vocabulary_list=user_vocabulary.tolist(), 
#                                                                   default_value=0)


        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                "reporter", vocabulary_list=user_vocabulary.tolist(), 
                default_value=0),
            1500,
        ),
        
#         hub.text_embedding_column(
#             key="description_clean", 
#             module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",
#             trainable=True,
#         ),
        
#         hub.text_embedding_column(
#             key="summary_clean", 
#             module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",
#             trainable=True,
#         ),
        
        hub.text_embedding_column(
            key="description_clean", 
            module_spec="https://tfhub.dev/google/Wiki-words-500-with-normalization/1",
            trainable=True,
        ),
        
        hub.text_embedding_column(
            key="summary_clean", 
            module_spec="https://tfhub.dev/google/Wiki-words-500-with-normalization/1",
            trainable=True,
        ),
        
    ]

In [18]:
# Create estimator train and evaluate function
def train_and_evaluate(output_dir, num_train_steps):
    estimator = tf.estimator.DNNClassifier(
        model_dir = output_dir,
        feature_columns = create_feature_cols(),
        hidden_units=[200, 100, 50],
        n_classes=len(user_vocabulary.tolist()),
        label_vocabulary = user_vocabulary.tolist(),
    )
    
#     estimator = tf.estimator.LinearClassifier(
#         model_dir = output_dir, 
#         feature_columns = create_feature_cols(),
#         n_classes=len(user_vocabulary.tolist()),
#         label_vocabulary = user_vocabulary.tolist(),
#     )
    train_spec = tf.estimator.TrainSpec(input_fn = make_input_fn(traindf, None),
                                        max_steps = num_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn = make_input_fn(evaldf, 1), 
                                      steps = None, 
                                      start_delay_secs = 1, # start evaluating after N seconds, 
                                      throttle_secs = 10)  # evaluate every N seconds
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

In [19]:
OUTDIR = './trained_model'

In [None]:
# Run the model
shutil.rmtree(OUTDIR, ignore_errors = True)
train_and_evaluate(OUTDIR, 2000)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/Wiki-words-500-with-normalization/1'.
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/Wiki-words-500-with-normalization/1'.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_model_dir': './trained_model', '_save_checkpoints_steps': None, '_num_worker_replicas': 1, '_task_type': 'worker', '_task_id': 0, '_num_ps_replicas': 0, '_is_chief': True, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_train_distribute': None, '_save_summary_steps': 100, '_tf_random_seed': None, '_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb52e07ed30>, '_keep_checkpoint_max': 5, '_master': '', '_log_step_count_steps': 100, '_device_fn': None, '_evaluation_master': ''}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 10 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_0:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_1:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_2:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_3:0 from checkpoint b'/tmp/tfhub_modules/f002061d9dee6acda3f90d591a65dbab7627f665/variables/variables' with embeddings
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/summary_clean_hub_module_embedding/module/embeddings/part_0:0,dnn/input_from_feature_col

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./trained_model/model.ckpt-2
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2 into ./trained_model/model.ckpt.
INFO:tensorflow:loss = 908.7196, step = 3
INFO:tensorflow:Saving checkpoints for 3 into ./trained_model/model.ckpt.
INFO:tensorflow:Loss for final step: 908.7196.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_0:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_1:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_2:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/emb

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-26-21:16:14
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./trained_model/model.ckpt-5
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-26-21:16:18
INFO:tensorflow:Saving dict for global step 5: accuracy = 0.041686807, average_loss = 13.048544, global_step = 5, loss = 1661.3807
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5: ./trained_model/model.ckpt-5
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_0:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_1:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_2:0,dnn/input_from_feature_colu

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./trained_model/model.ckpt-7
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 7 into ./trained_model/model.ckpt.
INFO:tensorflow:loss = 855.9408, step = 8
INFO:tensorflow:Saving checkpoints for 8 into ./trained_model/model.ckpt.
INFO:tensorflow:Loss for final step: 855.9408.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_0:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_1:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_2:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/emb

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-26-21:28:02
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./trained_model/model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-26-21:28:06
INFO:tensorflow:Saving dict for global step 10: accuracy = 0.0219913, average_loss = 6.3029327, global_step = 10, loss = 802.5088
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10: ./trained_model/model.ckpt-10
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_0:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_1:0,dnn/input_from_feature_columns/input_layer/description_clean_hub_module_embedding/module/embeddings/part_2:0,dnn/input_from_feature_co