In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM,Dense, Conv1D, MaxPooling1D, Bidirectional, Dropout, Input, Embedding
from tensorflow.keras import optimizers
import numpy as np
from tensorflow.keras.callbacks import TensorBoard
from sklearn.utils import shuffle
import json

In [2]:
# Location of the restaurant-dialogue JSON file that provides the training data.
# NOTE(review): this is an absolute, machine-specific path; adjust it to wherever
# your copy of the dataset lives before running the notebook.
RESTAURANTS_JSON = "/Users/tdubon/GitHub/Mastering-spaCy/Chapter10/data/Chapter10/data/restaurants.json"

# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default codec. The file is closed as soon as it is parsed.
with open(RESTAURANTS_JSON, "r", encoding="utf-8") as jfile:
    data = json.load(jfile)

# Parallel lists: utterances[i] is the raw text of a USER turn and labels[i] is
# 1 when that turn's intent is "FindRestaurants", otherwise 0.
utterances = []
labels = []

for dialogue in data:
    turns = dialogue["turns"]
    for turn in turns:
        speaker = turn["speaker"]
        # Only turns whose speaker is "USER" are kept as samples.
        if speaker == "USER":
            utterance, intent = turn["utterance"], turn["intent"]
            label = 1 if intent == "FindRestaurants" else 0
            utterances.append(utterance)
            labels.append(label)

In [9]:
# Preview the first ten collected user utterances as a sanity check on the load.
utterances[:10]

['I am feeling hungry so I would like to find a place to eat.',
 'I would like for it to be in San Jose.',
 'I usually like eating the American type of food.',
 'Can you give me the address of this restaurant.',
 'Can you give me the phone number that I can contact them with?',
 'Is there some other restaurant which you can suggest?',
 'Do you have another restaurant matching my needs? For example a restaurant which is economical and is located in Palo Alto.',
 'Alright, that seems good. I would like to make a booking at this restaurant.',
 'I will be eating there at 11:30 am so make it for then.',
 'That suits me well. Can you tell me if they feature live music?']

In [10]:
# Preview the labels that pair with the first ten utterances
# (1 = intent "FindRestaurants", 0 = any other intent).
labels[:10]

[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]

In [11]:
# The two lists are filled in lockstep, so their lengths should be equal.
len(utterances), len(labels)

(1233, 1233)

In [16]:
# Shuffle utterances and labels together so each text keeps its label;
# random_state=0 fixes the permutation for repeatable runs.
utterances, labels = shuffle(utterances, labels, random_state=0)

In [20]:
# Character-level vocabulary: each distinct lower-cased character gets its own
# integer id.
# The former `filters=".,;'\"-"` argument was dropped on purpose: Keras' Tokenizer
# only applies `filters` when it splits text into words, so with char_level=True
# it was silently ignored (punctuation still appears in tokenizer.word_index).
# Removing it leaves the resulting vocabulary unchanged and avoids suggesting that
# punctuation is being stripped.
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(utterances)

In [21]:
# Inspect the fitted character -> integer id mapping.
tokenizer.word_index

{' ': 1,
 'e': 2,
 'a': 3,
 't': 4,
 'o': 5,
 'n': 6,
 'i': 7,
 'r': 8,
 's': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'u': 13,
 '.': 14,
 'm': 15,
 'c': 16,
 'y': 17,
 'f': 18,
 'p': 19,
 'k': 20,
 'g': 21,
 'w': 22,
 'v': 23,
 '?': 24,
 ',': 25,
 'b': 26,
 "'": 27,
 '1': 28,
 ':': 29,
 '0': 30,
 '3': 31,
 '5': 32,
 'x': 33,
 '4': 34,
 'q': 35,
 '2': 36,
 '!': 37,
 'z': 38,
 '7': 39,
 '6': 40,
 'j': 41,
 '-': 42,
 '8': 43,
 '9': 44,
 '"': 45,
 '`': 46}

In [22]:
# Encode every utterance with the fitted tokenizer.
# NOTE: this rebinds `utterances` from raw text to encoded sequences, so the cell
# is not meant to be re-run on its own; re-run the loading cells first.
utterances = tokenizer.texts_to_sequences(utterances)

In [23]:
# Length of the longest encoded utterance; used to judge a sensible padding length.
mutt_len = max(map(len, utterances))

print(mutt_len)

156


In [24]:
# Fixed sequence length used for padding and for the model's input shape.
# NOTE(review): the longest encoded utterance measured in this notebook was 156,
# so with 150 the few longest sequences will be cut when padded — confirm this
# is intended.
MAX_LEN = 150

In [25]:
# Show the first encoded utterance before padding.
utterances[0]

[17,
 2,
 9,
 25,
 1,
 7,
 1,
 22,
 3,
 6,
 4,
 1,
 7,
 4,
 1,
 5,
 6,
 1,
 4,
 10,
 2,
 1,
 28,
 28,
 4,
 10]

In [26]:
# Bring every sequence to exactly MAX_LEN ids so they stack into one 2-D array.
# Zeros are appended at the end (padding="post"). Truncation is set to the end as
# well: the library default (truncating="pre") would drop the *opening* characters
# of any utterance longer than MAX_LEN, which is inconsistent with end-padding.
utterances = pad_sequences(
    utterances,
    maxlen=MAX_LEN,
    padding="post",
    truncating="post",
)

In [27]:
# Show the first utterance again, now padded out to MAX_LEN entries.
utterances[0]

array([17,  2,  9, 25,  1,  7,  1, 22,  3,  6,  4,  1,  7,  4,  1,  5,  6,
        1,  4, 10,  2,  1, 28, 28,  4, 10,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [28]:
# Make sure both the inputs and the targets are NumPy arrays before training.
utterances = np.array(utterances)
labels = np.array(labels)

In [29]:
# Check the array shapes before building and training the model.
utterances.shape, labels.shape

((1233, 150), (1233,))

In [30]:
# Model input: one padded utterance, i.e. MAX_LEN integer character ids.
utt_in = Input(shape=(MAX_LEN,))

# input_dim is the vocabulary size + 1 because id 0 is not in the tokenizer's
# vocabulary; it is the value pad_sequences uses for padding.
# mask_zero=True makes the embedding emit a mask for those padding positions so
# the LSTM skips them. Without it, the forward LSTM would keep updating its state
# over the whole padding tail that follows each utterance.
# NOTE(review): masking assumes no utterance encodes to an all-zero row (an empty
# utterance); confirm against the dataset.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=100,
    input_length=MAX_LEN,
    mask_zero=True,
)
# return_sequences=False: only the final state of each direction is kept, giving
# one fixed-size vector per utterance.
lstm = Bidirectional(LSTM(units=100, return_sequences=False))

utt_embedding = embedding_layer(utt_in)
utt_encoded = lstm(utt_embedding)

# Single sigmoid unit for the binary intent label. The layer is named explicitly
# so that model.summary() lists it as "classification_layer", matching the
# recorded summary output in this notebook.
output = Dense(1, activation='sigmoid', name="classification_layer")(utt_encoded)

In [31]:
# Wrap the layer graph (character ids in, sigmoid score out) into a Keras Model.
model = Model(inputs=utt_in, outputs=output)

In [32]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# alongside the loss during training.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [33]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 100)          4700      
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
classification_layer (Dense) (None, 1)                 201       
Total params: 165,701
Trainable params: 165,701
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train for 10 epochs in batches of 64, holding out 10% of the samples for
# validation. The call's return value (a History object) is the cell's output.
model.fit(
    utterances,
    labels,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f69d8431f60>