# HAL-9000 intent classification demo

In [1]:
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
# NOTE(review): wildcard import kept for compatibility (it provides `Magnitude`).
# Its position relative to the other imports is preserved so that any names it
# happens to shadow stay the same; prefer `from pymagnitude import Magnitude`.
from pymagnitude import *
from nltk.stem.lancaster import LancasterStemmer
from sklearn.model_selection import train_test_split

## Load intents dataset

In [2]:
# Location of the intents CSV; it must provide "utterances" and "labels" columns.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
INTENTS_CSV = "/Users/zetong/intents.csv"

# The file is decoded as ASCII with errors="ignore", so any non-ASCII byte is
# silently dropped from the text instead of raising a decode error.
with open(INTENTS_CSV, mode="r", encoding="ascii", errors="ignore") as csvfile:
    intents = pd.read_csv(csvfile)

# Raw utterance strings, as a NumPy array so they can be split and sliced later.
X = np.asarray(list(intents["utterances"]))

# Encode the string labels as integer class ids. `le` is kept around because its
# `classes_` array determines the size of the model's output layer.
le = preprocessing.LabelEncoder()
y = le.fit_transform(list(intents["labels"]))

In [3]:
# Shape of the label encoder's class array, i.e. how many distinct intent labels were found.
le.classes_.shape

(22,)

In [4]:
# Hold out 20% of all examples as the test set, then take 10% of what remains as
# the validation set. The fixed random_state makes both splits reproducible.
# (`train_test_split` is imported in the notebook's import cell.)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, random_state=42)

In [5]:
# Open the GloVe 840B / 300-d embeddings stored in Magnitude format. `vectors` supplies
# the embedding width (`vectors.dim`) and the token lookups (`vectors.query`) used below.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
vectors = Magnitude("/Users/zetong/Downloads/glove.840B.300d.magnitude")

In [6]:
# Every utterance is fed to the network as exactly MAX_SEQ_LEN token vectors.
MAX_SEQ_LEN = 28
layers = tf.keras.layers

# Input: one (MAX_SEQ_LEN, vectors.dim) embedding matrix per utterance.
i = layers.Input(shape=(MAX_SEQ_LEN, vectors.dim))

# Bidirectional LSTM that returns the full sequence; the forward and backward
# outputs are concatenated, then max-pooled over the time axis.
recurrent = layers.LSTM(32, activation="tanh", return_sequences=True)
Bidir_LSTM = layers.Bidirectional(recurrent, merge_mode="concat")(i)
maxpool = layers.GlobalMaxPooling1D()(Bidir_LSTM)

# Classification head. NOTE(review): Dense(32) is given no activation here,
# so it is a purely linear projection - confirm that this is intended.
hidden = layers.Dense(32)(maxpool)
dropout = layers.Dropout(0.3)(hidden)
n_classes = le.classes_.shape[0]
output = layers.Dense(n_classes, activation="softmax")(dropout)

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model = tf.keras.Model(inputs=i, outputs=output)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["acc"],
)
model.summary()

W0909 18:44:29.469933 4521047488 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0909 18:44:29.480068 4521047488 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0909 18:44:29.480978 4521047488 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensor

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 28, 300)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 28, 64)            85248     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 22)                726       
Total params: 88,054
Trainable params: 88,054
Non-trainable params: 0
_________________________________________________________

In [7]:
def make_dataset(xarr, yarr, repeat=True, shuffle=True):
    """Build a batched tf.data pipeline that embeds raw utterances on the fly.

    Each utterance is tokenized, stemmed with the Lancaster stemmer, looked up
    in `vectors`, and truncated or zero-padded to exactly MAX_SEQ_LEN rows.

    Relies on notebook globals read at call time: `vectors`, `MAX_SEQ_LEN`
    and `batch_size` (the latter is defined in a later cell, so it must exist
    before this function is *called*).

    Args:
        xarr: Array of utterance strings.
        yarr: Array of labels, one scalar per utterance.
        repeat: If True (default, original behaviour) the dataset repeats
            indefinitely, so callers must pass explicit step counts to Keras.
            Pass False for a single finite pass.
        shuffle: If True (default, original behaviour) examples are shuffled
            with a 1000-element buffer. Pass False to keep the input order,
            e.g. for evaluation.

    Returns:
        A tf.data.Dataset yielding `(features, labels)` batches, where each
        example's features have shape `(MAX_SEQ_LEN, vectors.dim)` and dtype
        float32.
    """
    # Built once per dataset instead of once per utterance.
    stemmer = LancasterStemmer()

    def _pad_or_truncate(matrix):
        """Return `matrix` with exactly MAX_SEQ_LEN rows (cut, or zero-padded at the end)."""
        n_rows = matrix.shape[0]
        if n_rows >= MAX_SEQ_LEN:
            return matrix[:MAX_SEQ_LEN, :]
        # float32 padding matches the dtype declared to tf.py_function below.
        padding = np.zeros((MAX_SEQ_LEN - n_rows, matrix.shape[1]), dtype=np.float32)
        return np.concatenate((matrix, padding), axis=0)

    def _process_string(x):
        """Turn one eager string tensor into a (MAX_SEQ_LEN, vectors.dim) float32 array."""
        text = x.numpy().decode()
        tokens = [stemmer.stem(token) for token in word_tokenize(text)]
        if not tokens:
            # Nothing to embed (e.g. empty utterance): use an all-zero matrix.
            return np.zeros((MAX_SEQ_LEN, vectors.dim), dtype=np.float32)
        embedded = np.asarray(vectors.query(tokens), dtype=np.float32)
        return _pad_or_truncate(embedded)

    def _process_datapair(X, y):
        """Embed the utterance and restore the static shapes lost by py_function."""
        X = tf.py_function(_process_string, [X], tf.float32)
        X.set_shape([MAX_SEQ_LEN, vectors.dim])
        y.set_shape([])
        return X, y

    dataset = tf.data.Dataset.from_tensor_slices((xarr, yarr))
    if repeat:
        dataset = dataset.repeat()
    dataset = dataset.map(_process_datapair)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1000)
    return dataset.batch(batch_size).prefetch(batch_size)

In [8]:
# Batch size for all three pipelines; make_dataset reads this global when called.
batch_size = 32

# Build the train / validation / test pipelines, in that order.
train, val, test = (
    make_dataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [9]:
# Stop training once val_loss has not improved for 20 consecutive epochs.
stopping_early = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=20)

# Keep only the weights of the epoch with the lowest val_loss seen so far.
filename = 'HAL-9000.h5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(filename, monitor='val_loss', save_best_only=True, mode='min')

# The datasets repeat indefinitely, so Keras needs explicit step counts. They
# must be whole numbers: plain `/` yields a float, so round up to cover every
# example at least once per epoch.
train_steps = int(np.ceil(X_train.shape[0] / batch_size))
val_steps = int(np.ceil(X_val.shape[0] / batch_size))

# Keep the History object so the loss/accuracy curves can be inspected later.
history = model.fit(
    train,
    validation_data=val,
    callbacks=[stopping_early, checkpoint],
    validation_steps=val_steps,
    steps_per_epoch=train_steps,
    epochs=100,
)

W0909 18:44:33.970619 4521047488 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/100


W0909 18:44:36.098089 123145500016640 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0909 18:44:36.119205 123145500016640 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0909 18:44:36.137377 123145500016640 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0909 18:44:36.145350 123145499480064 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0909 18:44:36.156615 123145500016640 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


<tensorflow.python.keras.callbacks.History at 0x1d6b75c88>

In [10]:
# `steps` must be a whole number; plain `/` yields a float, so round up.
# NOTE(review): `test` repeats and shuffles, so when the test-set size is not a
# multiple of batch_size the final batch wraps into the next pass; the reported
# metrics are therefore an estimate rather than an exact single pass.
model.evaluate(test, steps=int(np.ceil(X_test.shape[0] / batch_size)))



[0.35688852738703924, 0.9140625]

In [11]:
# Reload the best checkpoint from the same path ModelCheckpoint wrote to
# (`filename`, set in the training cell) rather than a separate hard-coded
# absolute path that only matches when the notebook runs from that directory.
HAL_9000 = tf.keras.models.load_model(filename)