In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [2]:
vocab_size = 3000 # size of vocabulary
embedding_dim = 64
max_length = 20
training_portion = .80 # set ratio of train (80%) and validation (20%)

In [3]:
list_of_questions = []
labels = []

In [4]:
# Read data and remove stopword
with open("data/train_5500.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        question = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            question = question.replace(token, ' ')
            question = question.replace(' ', ' ')
        list_of_questions.append(question)
print(len(labels))
print(len(list_of_questions))

5452
5452


In [5]:
train_size = int(len(list_of_questions) * training_portion)
train_questions = list_of_questions[0: train_size]
train_labels = labels[0: train_size]
validation_questions = list_of_questions[train_size:]
validation_labels = labels[train_size:]

tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(train_questions)
word_index = tokenizer.word_index

In [7]:
dict(list(word_index.items())[0:100]) ## print out first 100 index of vocabulary
train_sequences = tokenizer.texts_to_sequences(train_questions)

In [8]:
# First of 50 records in token form
for i in range(50):
    print(train_sequences[i])

[2, 2918, 1777, 691, 910]
[1, 572, 479, 78, 911, 2919]
[2, 9, 20, 480, 1224, 33, 245, 121]
[1, 2920, 2921, 2922, 344, 21, 1225]
[1, 295, 171, 345]
[1, 1778, 1779, 912, 1226, 1780]
[1, 72, 73, 3, 217, 692, 1781, 122]
[1, 413, 913]
[1, 1782, 2923]
[6, 2924, 1783, 914, 1784, 12, 59, 172]
[15, 2925, 2926, 80]
[18, 1785, 1786, 272, 2927, 2928]
[4, 12, 2929, 1227]
[4, 114, 915]
[1, 573, 2930, 2931, 916, 2932, 273, 1783]
[1, 2933, 14, 13, 25, 218, 1787]
[1, 2934, 917, 14, 13, 918, 1788]
[2, 7, 2935, 919, 2936, 2937, 2938]
[1, 414, 1789, 2939, 5]
[1, 2940, 1228]
[1, 130, 346, 49]
[1, 1790, 920, 2941, 347]
[6, 693, 50, 2942]
[1, 3, 348, 574]
[1, 40, 6, 921, 33]
[1, 3, 100, 922, 45, 1791, 101]
[4, 1792, 694, 923]
[1, 81, 2943, 115, 82]
[6, 415, 924, 1793, 481]
[19, 28, 82, 2944, 2945, 575]
[1, 195, 925, 55]
[1, 2946, 1229]
[2, 7, 695, 68, 696, 2947, 697, 576]
[19, 42, 1794, 482, 53, 2948, 926, 246, 173, 5]
[2, 7, 1795, 2949, 296, 347]
[1, 1230, 416]
[15, 2950, 58, 56]
[29, 698, 2951, 1231, 1796]

In [9]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')

In [10]:
# First of 50 records after padding to size 20
for i in range(50):
    print(train_padded[i])

[   2 2918 1777  691  910    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[   1  572  479   78  911 2919    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[   2    9   20  480 1224   33  245  121    0    0    0    0    0    0
    0    0    0    0    0    0]
[   1 2920 2921 2922  344   21 1225    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[  1 295 171 345   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
[   1 1778 1779  912 1226 1780    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[   1   72   73    3  217  692 1781  122    0    0    0    0    0    0
    0    0    0    0    0    0]
[  1 413 913   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
[   1 1782 2923    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
[   6 2924 1783  914 1784   12   59  172    0    0    0    0    0    0
    0    0    0    0    0    0]
[  15 2925

In [11]:
validation_sequences = tokenizer.texts_to_sequences(validation_questions)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding='post', truncating='post')

In [12]:
# set of lables
print(set(labels))

{'LOC', 'NUM', 'HUM', 'ABBR', 'DESC', 'ENTY'}


In [13]:
# label to token
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [14]:
# First of 50 labels (token form)
for i in range(50):
    print(training_label_seq[i])

[3]
[1]
[3]
[1]
[6]
[2]
[2]
[2]
[3]
[2]
[4]
[3]
[2]
[2]
[1]
[5]
[3]
[4]
[3]
[3]
[4]
[1]
[2]
[3]
[3]
[1]
[2]
[5]
[1]
[5]
[6]
[1]
[4]
[2]
[4]
[3]
[4]
[1]
[5]
[1]
[3]
[1]
[4]
[4]
[3]
[2]
[1]
[2]
[3]
[1]


In [15]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Checking encode and original
def decode_question(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print('------------------------')
print(decode_question(train_padded[20]))
print(train_questions[20])
print('------------------------')

------------------------
what date boxing day ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
What date Boxing Day ?
------------------------


In [16]:
# Use tf.keras.layers.Bidirectional(tf.keras.layers.LSTM()).
# Use ReLU in place of tanh function.
# Add a Dense layer with 7 units and softmax activation.

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(7, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          192000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 455       
Total params: 266,759
Trainable params: 266,759
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Traing model with 15 epochs
num_epochs = 15
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Train on 4361 samples, validate on 1091 samples
Epoch 1/15
4361/4361 - 7s - loss: 1.4846 - accuracy: 0.3843 - val_loss: 1.0704 - val_accuracy: 0.5683
Epoch 2/15
4361/4361 - 2s - loss: 0.6539 - accuracy: 0.7732 - val_loss: 0.5709 - val_accuracy: 0.7754
Epoch 3/15
4361/4361 - 2s - loss: 0.3043 - accuracy: 0.8989 - val_loss: 0.5829 - val_accuracy: 0.7782
Epoch 4/15
4361/4361 - 2s - loss: 0.1510 - accuracy: 0.9569 - val_loss: 0.6202 - val_accuracy: 0.7984
Epoch 5/15
4361/4361 - 2s - loss: 0.0947 - accuracy: 0.9768 - val_loss: 0.6201 - val_accuracy: 0.8139
Epoch 6/15
4361/4361 - 2s - loss: 0.0709 - accuracy: 0.9814 - val_loss: 0.6916 - val_accuracy: 0.8011
Epoch 7/15
4361/4361 - 2s - loss: 0.0611 - accuracy: 0.9846 - val_loss: 0.7962 - val_accuracy: 0.8020
Epoch 8/15
4361/4361 - 3s - loss: 0.0509 - accuracy: 0.9876 - val_loss: 0.7878 - val_accuracy: 0.8002
Epoch 9/15
4361/4361 - 3s - loss: 0.0447 - accuracy: 0.9892 - val_loss: 0.8889 - val_accuracy: 0.7993
Epoch 10/15
4361/4361 - 3s - loss:

In [19]:
# Predict input text
question_input = ["What metal has the highest melting point ?"]
seq = tokenizer.texts_to_sequences(question_input)
padded = pad_sequences(seq, maxlen=max_length)
prediction = model.predict(padded)
print(prediction)
print(labels[np.argmax(prediction)])

[[1.2480495e-05 6.6372859e-03 1.4034165e-03 9.4354695e-01 3.0001828e-02
  3.4741121e-03 1.4923892e-02]]
ENTY
