In [27]:
import keras, tensorflow, sys
keras.__version__, tensorflow.__version__, sys.version

('2.3.1',
 '2.1.0',
 '3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]')

In [28]:
import json
import pandas as pd
import numpy as np

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.utils.np_utils import to_categorical

from keras.layers import Dense, Input, Flatten, Lambda, Permute, GlobalMaxPooling1D, Activation, Concatenate
from keras.layers import Convolution1D, MaxPooling1D, Embedding, Dropout, Bidirectional, CuDNNGRU, SpatialDropout1D

from keras.models import Model

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

Load intents from:

https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines

In [29]:
# Create dataframe with intents
data = pd.DataFrame()

for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:

    with open("2017-06-custom-intent-engines/" + intent + "/train_" + intent + "_full.json",
              encoding='cp1251') as data_file:
        full_data = json.load(data_file)
        
    texts = []
    for i in range(len(full_data[intent])):
        text = ''
        for j in range(len(full_data[intent][i]['data'])):
            text += full_data[intent][i]['data'][j]['text']
        texts.append(text)

    dftrain = pd.DataFrame(data=texts, columns=['request'])
    dftrain[intent] = np.ones(dftrain.shape[0], dtype='int')

    data = data.append(dftrain, ignore_index=True, sort=False)

data = data.fillna(value=0)

data.shape

(13784, 8)

In [30]:
data.head(10)

Unnamed: 0,request,AddToPlaylist,BookRestaurant,GetWeather,PlayMusic,RateBook,SearchCreativeWork,SearchScreeningEvent
0,Add another song to the Cita RomГЎntica playli...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,add clem burke in my playlist Pre-Party R&B Jams,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Add Live from Aragon Ballroom to Trapeo,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,add Unite and Win to my night out,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Add track to my Digster Future Hits,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,add the piano bar to my Cindy Wilson,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Add Spanish Harlem Incident to cleaning the house,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,add The Greyest of Blue Skies in Indie EspaГ±o...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Add the name kids in the street to the plylist...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,add album radar latino,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Load pretrained embeddings:

https://nlp.stanford.edu/projects/glove/

In [31]:
def load_glove(word_index):
    EMBEDDING_FILE = 'D:/NLP Files/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8"))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = len(word_index)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

Split the data

In [32]:
X_train, X_test, y_train, y_test = train_test_split(data["request"], data[["AddToPlaylist", "BookRestaurant",
                                                    "GetWeather", "PlayMusic", "RateBook", "SearchCreativeWork",
                                                    "SearchScreeningEvent"]], test_size=0.25)

Preprocessing of the data: tokenizing and padding

In [33]:
# Tokenize
X_train = list(X_train)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Hardcoded parameters
word_index = tokenizer.word_index
vocab_size = len(word_index)
max_sent_len = 100


# Pad
X_train = pad_sequences(X_train, maxlen=max_sent_len)
X_test = pad_sequences(X_test, maxlen=max_sent_len)

embedding_matrix = load_glove(word_index)

  exec(code_obj, self.user_global_ns, self.user_ns)


One hot vectors

In [34]:
y_train = np.argmax(np.array(y_train), axis=-1)
y_test = np.argmax(np.array(y_test), axis=-1)

Create model with pretrained glove vectors as embedding weights.
Using GRU (Gated Recurrent Unit) and Global Max Pooling 1D

In [35]:
sequence_input = Input(shape=(max_sent_len,), dtype='int32')

words = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1], weights=[embedding_matrix],
                  trainable=True)(sequence_input)
words = Dropout(rate=0.3)(words)

output = Convolution1D(filters=256, filter_length=3, activation="tanh", padding='same', strides=1)(words)
output = Dropout(rate=0.3)(output)

# tf.contrib is not available in tf 2.0. Need to change it with something else
#output = Bidirectional(CuDNNGRU(units=64, return_sequences=True), merge_mode='concat')(output)
output_h = Activation('tanh')(output)

output1 = GlobalMaxPooling1D()(output_h) 

# Applying attention to RNN output
output = Dense(units=1)(output_h)
output = Permute((2, 1))(output)
output = Activation('softmax', name="attn_softmax")(output)
output = Lambda(lambda x: tf.matmul(x[0], x[1])) ([output, output_h])
output2 = Flatten() (output)

# Concatenating maxpooled and self attended features.
output = Concatenate()([output1, output2])
output = Dropout(rate=0.3)(output)

output = Dense(units=128, activation='tanh')(output)
output = Dropout(rate=0.3)(output)

output = Dense(units=32, activation='tanh')(output)
output = Dense(units=7, activation='softmax')(output)

model = Model(inputs=sequence_input, outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

model.summary()

  import sys


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 100, 300)     2908500     input_5[0][0]                    
__________________________________________________________________________________________________
dropout_13 (Dropout)            (None, 100, 300)     0           embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 100, 256)     230656      dropout_13[0][0]                 
____________________________________________________________________________________________

In [36]:
# 10 epochs and batch_size 256 looks the best
model.fit(X_train, np.array(y_train), epochs=10, batch_size=256)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x17d7223b548>

In [37]:
p = model.predict(X_test)
p = [np.argmax(i) for i in p]

print("f1_score (macro):", f1_score(y_test, p, average="macro"))
print("accuracy_score:", accuracy_score(y_test, p))

f1_score (macro): 0.9872489155064902
accuracy_score: 0.9872315728380732
